/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_buffer.h"
#include "nvk_cmd_buffer.h"
#include "nvk_descriptor_set.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nouveau_context.h"

#include "cla0b5.h"
#include "cla1c0.h"
#include "clc0c0.h"
#include "clc5c0.h"
#include "nvk_cl90c0.h"
#include "nvk_cl9097.h"
#include "nvk_cla0c0.h"
#include "nvk_clb0c0.h"
#include "nvk_clb1c0.h"
#include "nvk_clc3c0.h"
#include "nvk_clc597.h"
#include "nvk_clc6c0.h"

#include "drf.h"
#include "cla0c0qmd.h"
#include "clc0c0qmd.h"
#include "clc3c0qmd.h"
#include "clc6c0qmd.h"

#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
#define NVC6C0_QMDV03_00_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC6C0, QMDV03_00, ##a)
#define NVC6C0_QMDV03_00_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC6C0, QMDV03_00, ##a)

#define QMD_DEF_SET(qmd, class_id, version_major, version_minor, a...) \
   NVDEF_MW_SET((qmd), NV##class_id, QMDV##version_major##_##version_minor, ##a)
#define QMD_VAL_SET(qmd, class_id, version_major, version_minor, a...) \
   NVVAL_MW_SET((qmd), NV##class_id, QMDV##version_major##_##version_minor, ##a)

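/* A QMD ("Queue Meta Data") is the descriptor the compute engine consumes
 * to launch a grid: program address, register and barrier counts,
 * local/shared memory sizes, CTA dimensions, and constant buffer bindings.
 * The QMD layout is versioned per hardware generation, so the helpers in
 * this file come in one flavor per class (Kepler A0C0, Pascal C0C0, Volta
 * C3C0, Ampere C6C0) and the *_SET macros above paste the class and version
 * tokens into the right NVVAL/NVDEF accessor.
 */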
VkResult
nvk_push_dispatch_state_init(struct nvk_device *dev, struct nv_push *p)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   P_MTHD(p, NV90C0, SET_OBJECT);
   P_NV90C0_SET_OBJECT(p, {
      .class_id = pdev->info.cls_compute,
      .engine_id = 0,
   });

   if (pdev->info.cls_compute == MAXWELL_COMPUTE_A)
      P_IMMD(p, NVB0C0, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);

   if (pdev->info.cls_eng3d < VOLTA_COMPUTE_A) {
      uint64_t shader_base_addr =
         nvk_heap_contiguous_base_address(&dev->shader_heap);

      P_MTHD(p, NVA0C0, SET_PROGRAM_REGION_A);
      P_NVA0C0_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
      P_NVA0C0_SET_PROGRAM_REGION_B(p, shader_base_addr);
   }

   return VK_SUCCESS;
}

static inline uint16_t
nvk_cmd_buffer_compute_cls(struct nvk_cmd_buffer *cmd)
{
   return nvk_cmd_buffer_device(cmd)->pdev->info.cls_compute;
}

void
nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
                             const VkCommandBufferBeginInfo *pBeginInfo)
{
   if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
      if (nvk_cmd_buffer_compute_cls(cmd) >= MAXWELL_COMPUTE_B) {
         P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      }
      P_IMMD(p, NVA0C0, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
         .lines = LINES_ALL,
      });
      P_IMMD(p, NVA0C0, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
         .lines = LINES_ALL,
      });
   }
}

void
nvk_cmd_invalidate_compute_state(struct nvk_cmd_buffer *cmd)
{
   memset(&cmd->state.cs, 0, sizeof(cmd->state.cs));
}

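/* Volta+ SM shared-memory configuration fields don't take a byte size
 * directly: the requested size is rounded up to the next supported
 * configuration (8, 16, 32, 64, or 96 KiB here) and encoded in 4 KiB units,
 * off by one, i.e. (size / 4096) + 1.  For example, a 20 KiB request rounds
 * up to 32 KiB and encodes as 9.
 */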
static int
gv100_sm_config_smem_size(uint32_t size)
{
   if      (size > 64 * 1024) size = 96 * 1024;
   else if (size > 32 * 1024) size = 64 * 1024;
   else if (size > 16 * 1024) size = 32 * 1024;
   else if (size >  8 * 1024) size = 16 * 1024;
   else                       size =  8 * 1024;
   return (size / 4096) + 1;
}

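/* Common QMD fields shared by every generation.  This is a macro rather
 * than a function so the class/version tokens can be pasted into the field
 * accessors: each QMD version uses the same field names but different bit
 * positions, and QMD_DEF_SET/QMD_VAL_SET resolve them at compile time.
 */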
#define nvk_qmd_init_base(qmd, shader, class_id, version_major, version_minor)                 \
do {                                                                                           \
   QMD_DEF_SET(qmd, class_id, version_major, version_minor, API_VISIBLE_CALL_LIMIT, NO_CHECK); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, BARRIER_COUNT,                     \
                                                            shader->info.num_barriers);        \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION0,             \
                                                            shader->info.cs.local_size[0]);    \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION1,             \
                                                            shader->info.cs.local_size[1]);    \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION2,             \
                                                            shader->info.cs.local_size[2]);    \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, QMD_MAJOR_VERSION, version_major); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, QMD_VERSION, version_minor);       \
   QMD_DEF_SET(qmd, class_id, version_major, version_minor, SAMPLER_INDEX, INDEPENDENTLY);     \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHADER_LOCAL_MEMORY_LOW_SIZE,      \
                                                            align(shader->info.slm_size, 0x10)); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHARED_MEMORY_SIZE,                \
                                                            align(shader->info.cs.smem_size, 0x100)); \
} while (0)

static void
nva0c0_qmd_init(uint32_t *qmd, const struct nvk_shader *shader)
{
   nvk_qmd_init_base(qmd, shader, A0C0, 00, 06);

   if (shader->info.cs.smem_size <= (16 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB);
   else if (shader->info.cs.smem_size <= (32 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB);
   else if (shader->info.cs.smem_size <= (48 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
   else
      unreachable("Invalid shared memory size");

   uint64_t addr = shader->hdr_addr;
   assert(addr < 0xffffffff);
   NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET, addr);
   NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, shader->info.num_gprs);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30);
}

static void
nvc0c0_qmd_init(uint32_t *qmd, const struct nvk_shader *shader)
{
   nvk_qmd_init_base(qmd, shader, C0C0, 02, 01);

   uint64_t addr = shader->hdr_addr;
   assert(addr < 0xffffffff);

   NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET, addr);
   NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, shader->info.num_gprs);
}

static void
nvc3c0_qmd_init(uint32_t *qmd, const struct nvk_shader *shader)
{
   nvk_qmd_init_base(qmd, shader, C3C0, 02, 02);

   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   /* These fields are all QMD 2.2+ */
   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shader->info.cs.smem_size));
   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(NVK_MAX_SHARED_SIZE));
   NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shader->info.cs.smem_size));

   NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, shader->info.num_gprs);

   uint64_t addr = shader->hdr_addr;
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, addr & 0xffffffff);
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, addr >> 32);
}

static void
nvc6c0_qmd_init(uint32_t *qmd, const struct nvk_shader *shader)
{
   nvk_qmd_init_base(qmd, shader, C6C0, 03, 00);

   NVC6C0_QMDV03_00_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   /* These fields are all QMD 2.2+ */
   NVC6C0_QMDV03_00_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shader->info.cs.smem_size));
   NVC6C0_QMDV03_00_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(NVK_MAX_SHARED_SIZE));
   NVC6C0_QMDV03_00_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shader->info.cs.smem_size));

   NVC6C0_QMDV03_00_VAL_SET(qmd, REGISTER_COUNT_V, shader->info.num_gprs);

   uint64_t addr = shader->hdr_addr;
   NVC6C0_QMDV03_00_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, addr & 0xffffffff);
   NVC6C0_QMDV03_00_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, addr >> 32);
}

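/* Pick the newest QMD layout the device's compute class supports. */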
static void
nvk_qmd_init(struct nvk_physical_device *pdev,
             uint32_t *qmd, const struct nvk_shader *shader)
{
   if (pdev->info.cls_compute >= AMPERE_COMPUTE_A)
      nvc6c0_qmd_init(qmd, shader);
   else if (pdev->info.cls_compute >= VOLTA_COMPUTE_A)
      nvc3c0_qmd_init(qmd, shader);
   else if (pdev->info.cls_compute >= PASCAL_COMPUTE_A)
      nvc0c0_qmd_init(qmd, shader);
   else if (pdev->info.cls_compute >= KEPLER_COMPUTE_A)
      nva0c0_qmd_init(qmd, shader);
   else
      unreachable("Unknown GPU generation");
}

static void
nva0c0_qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd,
                             uint32_t x, uint32_t y, uint32_t z)
{
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, x);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, y);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, z);
}

static void
nvc0c0_qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd,
                             uint32_t x, uint32_t y, uint32_t z)
{
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, x);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, y);
   /* this field is different from older QMD versions */
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, z);
}

static void
nvc6c0_qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd,
                             uint32_t x, uint32_t y, uint32_t z)
{
   NVC6C0_QMDV03_00_VAL_SET(qmd, CTA_RASTER_WIDTH, x);
   NVC6C0_QMDV03_00_VAL_SET(qmd, CTA_RASTER_HEIGHT, y);
   /* this field is different from older QMD versions */
   NVC6C0_QMDV03_00_VAL_SET(qmd, CTA_RASTER_DEPTH, z);
}

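/* Byte offset of the dispatch size within the QMD.  The indirect dispatch
 * path patches the group counts into the QMD in GPU memory, which only
 * works because CTA_RASTER_{WIDTH,HEIGHT,DEPTH} are three consecutive,
 * dword-aligned 32-bit fields on Volta+; the asserts below check that
 * layout.
 */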
static uint32_t
qmd_dispatch_size_offset(const struct nv_device_info *devinfo)
{
   assert(devinfo->cls_compute >= VOLTA_COMPUTE_A);
   uint32_t bit = DRF_LO(DRF_MW(NVC3C0_QMDV02_02_CTA_RASTER_WIDTH));
   assert(bit % 32 == 0);
   assert(DRF_LO(DRF_MW(NVC3C0_QMDV02_02_CTA_RASTER_HEIGHT)) == bit + 32);
   assert(DRF_LO(DRF_MW(NVC3C0_QMDV02_02_CTA_RASTER_DEPTH)) == bit + 64);
   return bit / 8;
}

static inline void
nva0c0_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                             uint32_t size, uint64_t address)
{
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static inline void
nvc0c0_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                             uint32_t size, uint64_t address)
{
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
                            DIV_ROUND_UP(size, 16));
   NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static inline void
nvc6c0_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                             uint32_t size, uint64_t address)
{
   NVC6C0_QMDV03_00_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVC6C0_QMDV03_00_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVC6C0_QMDV03_00_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
                            DIV_ROUND_UP(size, 16));
   NVC6C0_QMDV03_00_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

void
nvk_cmd_bind_compute_shader(struct nvk_cmd_buffer *cmd,
                            struct nvk_shader *shader)
{
   cmd->state.cs.shader = shader;
}

static uint32_t
nvk_compute_local_size(struct nvk_cmd_buffer *cmd)
{
   const struct nvk_shader *shader = cmd->state.cs.shader;

   return shader->info.cs.local_size[0] *
          shader->info.cs.local_size[1] *
          shader->info.cs.local_size[2];
}

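/* Upload everything a dispatch needs: flush push descriptors, copy the root
 * descriptor table into a fresh cbuf-aligned allocation, then build and
 * upload the QMD with the dispatch size and cbuf bindings filled in.
 * Returns the GPU address of the uploaded QMD, or 0 on allocation failure
 * (the error is recorded on the command buffer).
 */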
static uint64_t
nvk_flush_compute_state(struct nvk_cmd_buffer *cmd,
                        uint64_t *root_desc_addr_out)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
   const struct nvk_shader *shader = cmd->state.cs.shader;
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;
   VkResult result;

   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);

   /* Pre-Pascal, constant buffer sizes need to be 0x100-aligned.  Since we
    * simply allocate a buffer and upload data to it, make sure its size is
    * 0x100-aligned.
    */
   STATIC_ASSERT((sizeof(desc->root) & 0xff) == 0);
   assert(sizeof(desc->root) % min_cbuf_alignment == 0);

   void *root_desc_map;
   uint64_t root_desc_addr;
   result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(desc->root),
                                        min_cbuf_alignment,
                                        &root_desc_addr, &root_desc_map);
   if (unlikely(result != VK_SUCCESS)) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return 0;
   }

   desc->root.root_desc_addr = root_desc_addr;
   memcpy(root_desc_map, &desc->root, sizeof(desc->root));

   uint32_t qmd[128];
   memset(qmd, 0, sizeof(qmd));
   nvk_qmd_init(pdev, qmd, shader);

   if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_A) {
      nvc6c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd,
                                   desc->root.cs.group_count[0],
                                   desc->root.cs.group_count[1],
                                   desc->root.cs.group_count[2]);
   } else if (nvk_cmd_buffer_compute_cls(cmd) >= PASCAL_COMPUTE_A) {
      nvc0c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd,
                                   desc->root.cs.group_count[0],
                                   desc->root.cs.group_count[1],
                                   desc->root.cs.group_count[2]);
   } else {
      assert(nvk_cmd_buffer_compute_cls(cmd) >= KEPLER_COMPUTE_A);
      nva0c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd,
                                   desc->root.cs.group_count[0],
                                   desc->root.cs.group_count[1],
                                   desc->root.cs.group_count[2]);
   }

   for (uint32_t c = 0; c < shader->cbuf_map.cbuf_count; c++) {
      const struct nvk_cbuf *cbuf = &shader->cbuf_map.cbufs[c];

      struct nvk_buffer_address ba;
      if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
         ba = (struct nvk_buffer_address) {
            .base_addr = root_desc_addr,
            .size = sizeof(desc->root),
         };
      } else {
         ASSERTED bool direct_descriptor =
            nvk_cmd_buffer_get_cbuf_descriptor(cmd, desc, shader, cbuf, &ba);
         assert(direct_descriptor);
      }

      if (ba.size > 0) {
         assert(ba.base_addr % min_cbuf_alignment == 0);
         ba.size = align(ba.size, min_cbuf_alignment);
         ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);

         if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_A) {
            nvc6c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
         } else if (nvk_cmd_buffer_compute_cls(cmd) >= PASCAL_COMPUTE_A) {
            nvc0c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
         } else {
            assert(nvk_cmd_buffer_compute_cls(cmd) >= KEPLER_COMPUTE_A);
            nva0c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
         }
      }
   }

   uint64_t qmd_addr;
   result = nvk_cmd_buffer_upload_data(cmd, qmd, sizeof(qmd), 256, &qmd_addr);
   if (unlikely(result != VK_SUCCESS)) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return 0;
   }

   if (root_desc_addr_out != NULL)
      *root_desc_addr_out = root_desc_addr;

   return qmd_addr;
}

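/* Accumulate a 64-bit compute invocation count into a pair of MME shadow
 * scratch registers.  Doing the addition on the macro engine means indirect
 * dispatches, whose group counts are only known at execution time, can
 * still be counted (e.g. for pipeline statistics queries).
 */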
static void
nvk_build_mme_add_cs_invocations(struct mme_builder *b,
                                 struct mme_value64 count)
{
   struct mme_value accum_hi = nvk_mme_load_scratch(b, CS_INVOCATIONS_HI);
   struct mme_value accum_lo = nvk_mme_load_scratch(b, CS_INVOCATIONS_LO);
   struct mme_value64 accum = mme_value64(accum_lo, accum_hi);

   accum = mme_add64(b, accum, count);

   STATIC_ASSERT(NVK_MME_SCRATCH_CS_INVOCATIONS_HI + 1 ==
                 NVK_MME_SCRATCH_CS_INVOCATIONS_LO);

   mme_mthd(b, NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI));
   mme_emit(b, accum.hi);
   mme_emit(b, accum.lo);
}

void
nvk_mme_add_cs_invocations(struct mme_builder *b)
{
   struct mme_value64 count = mme_load_addr64(b);

   nvk_build_mme_add_cs_invocations(b, count);
}

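/* Direct dispatch: the group counts are known on the CPU, so we bake them
 * into the root descriptor and QMD, bump the CS invocation counter with a
 * precomputed value, and kick off the grid with SEND_PCAS using the
 * 256-byte-aligned QMD address.  Turing and older signal via
 * SEND_SIGNALING_PCAS_B; newer classes use SEND_SIGNALING_PCAS2_B instead.
 */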
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDispatchBase(VkCommandBuffer commandBuffer,
                    uint32_t baseGroupX,
                    uint32_t baseGroupY,
                    uint32_t baseGroupZ,
                    uint32_t groupCountX,
                    uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;

   desc->root.cs.base_group[0] = baseGroupX;
   desc->root.cs.base_group[1] = baseGroupY;
   desc->root.cs.base_group[2] = baseGroupZ;
   desc->root.cs.group_count[0] = groupCountX;
   desc->root.cs.group_count[1] = groupCountY;
   desc->root.cs.group_count[2] = groupCountZ;

   uint64_t qmd_addr = nvk_flush_compute_state(cmd, NULL);
   if (unlikely(qmd_addr == 0))
      return;

   const uint32_t local_size = nvk_compute_local_size(cmd);
   const uint64_t cs_invocations =
      (uint64_t)local_size * (uint64_t)groupCountX *
      (uint64_t)groupCountY * (uint64_t)groupCountZ;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);

   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
   P_INLINE_DATA(p, cs_invocations >> 32);
   P_INLINE_DATA(p, cs_invocations);

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);

   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
            .invalidate = INVALIDATE_TRUE,
            .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }
}

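/* MME helper that stores a 32-bit value to an arbitrary GPU address by
 * (ab)using the 3D engine's report-semaphore release as a plain store.
 * The 0x10000000 word is SET_REPORT_SEMAPHORE_D; it appears to select a
 * one-word release, so only the 32-bit payload is written rather than a
 * full semaphore report.
 */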
static void
mme_store_global(struct mme_builder *b,
                 struct mme_value64 addr,
                 uint64_t offset,
                 struct mme_value v)
{
   if (offset > 0)
      addr = mme_add64(b, addr, mme_imm64(offset));

   mme_mthd(b, NV9097_SET_REPORT_SEMAPHORE_A);
   mme_emit_addr64(b, addr);
   mme_emit(b, v);
   mme_emit(b, mme_imm(0x10000000));

   if (offset > 0) {
      mme_free_reg(b, addr.lo);
      mme_free_reg(b, addr.hi);
   }
}

static void
mme_store_global_vec3(struct mme_builder *b,
                      struct mme_value64 addr,
                      uint32_t offset,
                      struct mme_value x,
                      struct mme_value y,
                      struct mme_value z)
{
   mme_store_global(b, addr, offset + 0, x);
   mme_store_global(b, addr, offset + 4, y);
   mme_store_global(b, addr, offset + 8, z);
}

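/* Indirect dispatch macro.  Its parameters, pushed inline after the macro
 * call, are the shader's local size followed by the addresses of the
 * VkDispatchIndirectCommand, the uploaded root descriptor, and the uploaded
 * QMD.  The macro reads the three group counts from the indirect buffer
 * through the MME data FIFO, accumulates local_size * x * y * z into the CS
 * invocation counter, and patches the counts into both the QMD and the root
 * descriptor in memory.  mme_tu104_read_fifoed requires Turing's MME, hence
 * the TURING_A guard.
 */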
void
nvk_mme_dispatch_indirect(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d < TURING_A)
      return;

   struct mme_value local_size = mme_load(b);
   struct mme_value64 dispatch_addr = mme_load_addr64(b);
   struct mme_value64 root_desc_addr = mme_load_addr64(b);
   struct mme_value64 qmd_addr = mme_load_addr64(b);

   mme_tu104_read_fifoed(b, dispatch_addr, mme_imm(3));

   uint32_t qmd_size_offset = qmd_dispatch_size_offset(b->devinfo);
   uint32_t root_desc_size_offset =
      offsetof(struct nvk_root_descriptor_table, cs.group_count);

   struct mme_value group_count_x = mme_load(b);
   struct mme_value group_count_y = mme_load(b);
   struct mme_value group_count_z = mme_load(b);

   struct mme_value64 cs1 = mme_umul_32x32_64(b, local_size, group_count_x);
   struct mme_value64 cs2 = mme_umul_32x32_64(b, group_count_y, group_count_z);
   nvk_build_mme_add_cs_invocations(b, mme_mul64(b, cs1, cs2));

   mme_store_global_vec3(b, qmd_addr, qmd_size_offset,
                         group_count_x, group_count_y, group_count_z);
   mme_store_global_vec3(b, root_desc_addr, root_desc_size_offset,
                         group_count_x, group_count_y, group_count_z);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
                        VkBuffer _buffer,
                        VkDeviceSize offset)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;

   /* TODO: Indirect dispatch pre-Turing */
   assert(nvk_cmd_buffer_device(cmd)->pdev->info.cls_eng3d >= TURING_A);

   desc->root.cs.base_group[0] = 0;
   desc->root.cs.base_group[1] = 0;
   desc->root.cs.base_group[2] = 0;

   uint64_t dispatch_addr = nvk_buffer_address(buffer, offset);

   uint64_t root_desc_addr;
   uint64_t qmd_addr = nvk_flush_compute_state(cmd, &root_desc_addr);
   if (unlikely(qmd_addr == 0))
      return;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 14);

   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
   P_INLINE_DATA(p, nvk_compute_local_size(cmd));
   P_INLINE_DATA(p, dispatch_addr >> 32);
   P_INLINE_DATA(p, dispatch_addr);
   P_INLINE_DATA(p, root_desc_addr >> 32);
   P_INLINE_DATA(p, root_desc_addr);
   P_INLINE_DATA(p, qmd_addr >> 32);
   P_INLINE_DATA(p, qmd_addr);

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);
   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
            .invalidate = INVALIDATE_TRUE,
            .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }
}