/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_buffer.h"
#include "nvk_cmd_buffer.h"
#include "nvk_descriptor_set.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nouveau_context.h"

#include "cla0b5.h"
#include "cla1c0.h"
#include "clc0c0.h"
#include "clc5c0.h"
#include "nvk_cl90c0.h"
#include "nvk_cl9097.h"
#include "nvk_cla0c0.h"
#include "nvk_clb0c0.h"
#include "nvk_clb1c0.h"
#include "nvk_clc3c0.h"
#include "nvk_clc597.h"
#include "nvk_clc6c0.h"

#include "drf.h"
#include "cla0c0qmd.h"
#include "clc0c0qmd.h"
#include "clc3c0qmd.h"
#include "clc6c0qmd.h"

#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
#define NVC6C0_QMDV03_00_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC6C0, QMDV03_00, ##a)
#define NVC6C0_QMDV03_00_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC6C0, QMDV03_00, ##a)

#define QMD_DEF_SET(qmd, class_id, version_major, version_minor, a...) \
   NVDEF_MW_SET((qmd), NV##class_id, QMDV##version_major##_##version_minor, ##a)
#define QMD_VAL_SET(qmd, class_id, version_major, version_minor, a...) \
   NVVAL_MW_SET((qmd), NV##class_id, QMDV##version_major##_##version_minor, ##a)

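/* Emit the initial compute state: bind the compute class object and, on
 * pre-Volta parts (whose QMDs use a relative PROGRAM_OFFSET), program the
 * shader heap base address via SET_PROGRAM_REGION.
 */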
VkResult
nvk_push_dispatch_state_init(struct nvk_device *dev, struct nv_push *p)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   P_MTHD(p, NV90C0, SET_OBJECT);
   P_NV90C0_SET_OBJECT(p, {
      .class_id = pdev->info.cls_compute,
      .engine_id = 0,
   });

   if (pdev->info.cls_compute == MAXWELL_COMPUTE_A)
      P_IMMD(p, NVB0C0, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);

   if (pdev->info.cls_compute < VOLTA_COMPUTE_A) {
      uint64_t shader_base_addr =
         nvk_heap_contiguous_base_address(&dev->shader_heap);

      P_MTHD(p, NVA0C0, SET_PROGRAM_REGION_A);
      P_NVA0C0_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
      P_NVA0C0_SET_PROGRAM_REGION_B(p, shader_base_addr);
   }

   return VK_SUCCESS;
}

static inline uint16_t
nvk_cmd_buffer_compute_cls(struct nvk_cmd_buffer *cmd)
{
   return nvk_cmd_buffer_device(cmd)->pdev->info.cls_compute;
}

void
nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
                             const VkCommandBufferBeginInfo *pBeginInfo)
{
   if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
      if (nvk_cmd_buffer_compute_cls(cmd) >= MAXWELL_COMPUTE_B) {
         P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      }
      P_IMMD(p, NVA0C0, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
         .lines = LINES_ALL,
      });
      P_IMMD(p, NVA0C0, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
         .lines = LINES_ALL,
      });
   }
}

void
nvk_cmd_invalidate_compute_state(struct nvk_cmd_buffer *cmd)
{
   memset(&cmd->state.cs, 0, sizeof(cmd->state.cs));
}

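/* Round a shared memory size in bytes up to the next SM configuration
 * supported on Volta+ (8, 16, 32, 64, or 96 KiB) and return it in the
 * (size / 4 KiB) + 1 encoding expected by the
 * *_SM_CONFIG_SHARED_MEM_SIZE QMD fields.
 */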
static int
gv100_sm_config_smem_size(uint32_t size)
{
   if      (size > 64 * 1024) size = 96 * 1024;
   else if (size > 32 * 1024) size = 64 * 1024;
   else if (size > 16 * 1024) size = 32 * 1024;
   else if (size >  8 * 1024) size = 16 * 1024;
   else                       size =  8 * 1024;
   return (size / 4096) + 1;
}

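/* Program the QMD fields that are common to every QMD version we emit:
 * CTA dimensions, barrier count, local (SLM) and shared memory sizes, and
 * the QMD version itself.
 */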
#define nvk_qmd_init_base(qmd, shader, class_id, version_major, version_minor) \
do { \
   QMD_DEF_SET(qmd, class_id, version_major, version_minor, API_VISIBLE_CALL_LIMIT, NO_CHECK); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, BARRIER_COUNT, shader->info.num_barriers); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION0, \
               shader->info.cs.local_size[0]); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION1, \
               shader->info.cs.local_size[1]); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, CTA_THREAD_DIMENSION2, \
               shader->info.cs.local_size[2]); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, QMD_MAJOR_VERSION, version_major); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, QMD_VERSION, version_minor); \
   QMD_DEF_SET(qmd, class_id, version_major, version_minor, SAMPLER_INDEX, INDEPENDENTLY); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHADER_LOCAL_MEMORY_LOW_SIZE, \
               align(shader->info.slm_size, 0x10)); \
   QMD_VAL_SET(qmd, class_id, version_major, version_minor, SHARED_MEMORY_SIZE, \
               align(shader->info.cs.smem_size, 0x100)); \
} while (0)

static void
nva0c0_qmd_init(uint32_t *qmd, const struct nvk_shader *shader)
{
   nvk_qmd_init_base(qmd, shader, A0C0, 00, 06);

   if (shader->info.cs.smem_size <= (16 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB);
   else if (shader->info.cs.smem_size <= (32 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB);
   else if (shader->info.cs.smem_size <= (48 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION, DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
   else
      unreachable("Invalid shared memory size");

   uint64_t addr = shader->hdr_addr;
   assert(addr < 0xffffffff);
   NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET, addr);
   NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, shader->info.num_gprs);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30);
}

static void
nvc0c0_qmd_init(uint32_t *qmd, const struct nvk_shader *shader)
{
   nvk_qmd_init_base(qmd, shader, C0C0, 02, 01);

   uint64_t addr = shader->hdr_addr;
   assert(addr < 0xffffffff);

   NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET, addr);
   NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, shader->info.num_gprs);
}

static void
nvc3c0_qmd_init(uint32_t *qmd, const struct nvk_shader *shader)
{
   nvk_qmd_init_base(qmd, shader, C3C0, 02, 02);

   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   /* those are all QMD 2.2+ */
   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shader->info.cs.smem_size));
   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(NVK_MAX_SHARED_SIZE));
   NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shader->info.cs.smem_size));

   NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, shader->info.num_gprs);

   uint64_t addr = shader->hdr_addr;
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, addr & 0xffffffff);
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, addr >> 32);
}

static void
nvc6c0_qmd_init(uint32_t *qmd, const struct nvk_shader *shader)
{
   nvk_qmd_init_base(qmd, shader, C6C0, 03, 00);

   NVC6C0_QMDV03_00_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   /* those are all QMD 2.2+ */
   NVC6C0_QMDV03_00_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shader->info.cs.smem_size));
   NVC6C0_QMDV03_00_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(NVK_MAX_SHARED_SIZE));
   NVC6C0_QMDV03_00_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(shader->info.cs.smem_size));

   NVC6C0_QMDV03_00_VAL_SET(qmd, REGISTER_COUNT_V, shader->info.num_gprs);

   uint64_t addr = shader->hdr_addr;
   NVC6C0_QMDV03_00_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, addr & 0xffffffff);
   NVC6C0_QMDV03_00_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, addr >> 32);
}

static void
nvk_qmd_init(struct nvk_physical_device *pdev,
             uint32_t *qmd, const struct nvk_shader *shader)
{
   if (pdev->info.cls_compute >= AMPERE_COMPUTE_A)
      nvc6c0_qmd_init(qmd, shader);
   else if (pdev->info.cls_compute >= VOLTA_COMPUTE_A)
      nvc3c0_qmd_init(qmd, shader);
   else if (pdev->info.cls_compute >= PASCAL_COMPUTE_A)
      nvc0c0_qmd_init(qmd, shader);
   else if (pdev->info.cls_compute >= KEPLER_COMPUTE_A)
      nva0c0_qmd_init(qmd, shader);
   else
      unreachable("Unknown GPU generation");
}

static void
nva0c0_qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd,
                             uint32_t x, uint32_t y, uint32_t z)
{
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, x);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, y);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, z);
}

static void
nvc0c0_qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd,
                             uint32_t x, uint32_t y, uint32_t z)
{
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, x);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, y);
   /* this field is different from older QMD versions */
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, z);
}

static void
nvc6c0_qmd_set_dispatch_size(UNUSED struct nvk_device *dev, uint32_t *qmd,
                             uint32_t x, uint32_t y, uint32_t z)
{
   NVC6C0_QMDV03_00_VAL_SET(qmd, CTA_RASTER_WIDTH, x);
   NVC6C0_QMDV03_00_VAL_SET(qmd, CTA_RASTER_HEIGHT, y);
   /* this field is different from older QMD versions */
   NVC6C0_QMDV03_00_VAL_SET(qmd, CTA_RASTER_DEPTH, z);
}

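/* Byte offset of the CTA_RASTER_{WIDTH,HEIGHT,DEPTH} triple within the
 * QMD.  The asserts check that the three fields are laid out as
 * consecutive 32-bit words so that nvk_mme_dispatch_indirect() can
 * overwrite them with three dword stores.
 */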
static uint32_t
qmd_dispatch_size_offset(const struct nv_device_info *devinfo)
{
   assert(devinfo->cls_compute >= VOLTA_COMPUTE_A);
   uint32_t bit = DRF_LO(DRF_MW(NVC3C0_QMDV02_02_CTA_RASTER_WIDTH));
   assert(bit % 32 == 0);
   assert(DRF_LO(DRF_MW(NVC3C0_QMDV02_02_CTA_RASTER_HEIGHT)) == bit + 32);
   assert(DRF_LO(DRF_MW(NVC3C0_QMDV02_02_CTA_RASTER_DEPTH)) == bit + 64);
   return bit / 8;
}

static inline void
nva0c0_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                             uint32_t size, uint64_t address)
{
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static inline void
nvc0c0_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                             uint32_t size, uint64_t address)
{
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
                            DIV_ROUND_UP(size, 16));
   NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static inline void
nvc6c0_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                             uint32_t size, uint64_t address)
{
   NVC6C0_QMDV03_00_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVC6C0_QMDV03_00_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVC6C0_QMDV03_00_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
                            DIV_ROUND_UP(size, 16));
   NVC6C0_QMDV03_00_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}


void
nvk_cmd_bind_compute_shader(struct nvk_cmd_buffer *cmd,
                            struct nvk_shader *shader)
{
   cmd->state.cs.shader = shader;
}

static uint32_t
nvk_compute_local_size(struct nvk_cmd_buffer *cmd)
{
   const struct nvk_shader *shader = cmd->state.cs.shader;

   return shader->info.cs.local_size[0] *
          shader->info.cs.local_size[1] *
          shader->info.cs.local_size[2];
}

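/* Upload the root descriptor table and build a QMD for the currently
 * bound compute shader and descriptor state.  Returns the GPU address of
 * the uploaded QMD, or 0 if the upload allocation failed (in which case
 * the error has already been recorded on the command buffer).
 */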
static uint64_t
nvk_flush_compute_state(struct nvk_cmd_buffer *cmd,
                        uint64_t *root_desc_addr_out)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
   const struct nvk_shader *shader = cmd->state.cs.shader;
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;
   VkResult result;

   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);

   /* Pre-Pascal, constant buffer sizes must be 0x100-aligned.  Since we
    * simply allocate a buffer and upload data to it, make sure its size is
    * 0x100-aligned too.
    */
   STATIC_ASSERT((sizeof(desc->root) & 0xff) == 0);
   assert(sizeof(desc->root) % min_cbuf_alignment == 0);

   void *root_desc_map;
   uint64_t root_desc_addr;
   result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(desc->root),
                                        min_cbuf_alignment,
                                        &root_desc_addr, &root_desc_map);
   if (unlikely(result != VK_SUCCESS)) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return 0;
   }

   desc->root.root_desc_addr = root_desc_addr;
   memcpy(root_desc_map, &desc->root, sizeof(desc->root));

   uint32_t qmd[128];
   memset(qmd, 0, sizeof(qmd));
   nvk_qmd_init(pdev, qmd, shader);

   if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_A) {
      nvc6c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd,
                                   desc->root.cs.group_count[0],
                                   desc->root.cs.group_count[1],
                                   desc->root.cs.group_count[2]);
   } else if (nvk_cmd_buffer_compute_cls(cmd) >= PASCAL_COMPUTE_A) {
      nvc0c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd,
                                   desc->root.cs.group_count[0],
                                   desc->root.cs.group_count[1],
                                   desc->root.cs.group_count[2]);
   } else {
      assert(nvk_cmd_buffer_compute_cls(cmd) >= KEPLER_COMPUTE_A);
      nva0c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd,
                                   desc->root.cs.group_count[0],
                                   desc->root.cs.group_count[1],
                                   desc->root.cs.group_count[2]);
   }

   for (uint32_t c = 0; c < shader->cbuf_map.cbuf_count; c++) {
      const struct nvk_cbuf *cbuf = &shader->cbuf_map.cbufs[c];

      struct nvk_buffer_address ba;
      if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
         ba = (struct nvk_buffer_address) {
            .base_addr = root_desc_addr,
            .size = sizeof(desc->root),
         };
      } else {
         ASSERTED bool direct_descriptor =
            nvk_cmd_buffer_get_cbuf_descriptor(cmd, desc, shader, cbuf, &ba);
         assert(direct_descriptor);
      }

      if (ba.size > 0) {
         assert(ba.base_addr % min_cbuf_alignment == 0);
         ba.size = align(ba.size, min_cbuf_alignment);
         ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);

         if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_A) {
            nvc6c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
         } else if (nvk_cmd_buffer_compute_cls(cmd) >= PASCAL_COMPUTE_A) {
            nvc0c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
         } else {
            assert(nvk_cmd_buffer_compute_cls(cmd) >= KEPLER_COMPUTE_A);
            nva0c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
         }
      }
   }

   uint64_t qmd_addr;
   result = nvk_cmd_buffer_upload_data(cmd, qmd, sizeof(qmd), 256, &qmd_addr);
   if (unlikely(result != VK_SUCCESS)) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return 0;
   }

   if (root_desc_addr_out != NULL)
      *root_desc_addr_out = root_desc_addr;

   return qmd_addr;
}

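/* Add a 64-bit invocation count to the running total kept in the
 * CS_INVOCATIONS_HI/LO MME shadow scratch registers.
 */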
static void
nvk_build_mme_add_cs_invocations(struct mme_builder *b,
                                 struct mme_value64 count)
{
   struct mme_value accum_hi = nvk_mme_load_scratch(b, CS_INVOCATIONS_HI);
   struct mme_value accum_lo = nvk_mme_load_scratch(b, CS_INVOCATIONS_LO);
   struct mme_value64 accum = mme_value64(accum_lo, accum_hi);

   accum = mme_add64(b, accum, count);

   STATIC_ASSERT(NVK_MME_SCRATCH_CS_INVOCATIONS_HI + 1 ==
                 NVK_MME_SCRATCH_CS_INVOCATIONS_LO);

   mme_mthd(b, NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI));
   mme_emit(b, accum.hi);
   mme_emit(b, accum.lo);
}

void
nvk_mme_add_cs_invocations(struct mme_builder *b)
{
   struct mme_value64 count = mme_load_addr64(b);

   nvk_build_mme_add_cs_invocations(b, count);
}

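/* Direct dispatch: the group counts are known on the CPU, so they are
 * baked into the QMD by nvk_flush_compute_state() and the QMD is launched
 * with SEND_PCAS_A (which takes the 256-byte-aligned QMD address >> 8).
 */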
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDispatchBase(VkCommandBuffer commandBuffer,
                    uint32_t baseGroupX,
                    uint32_t baseGroupY,
                    uint32_t baseGroupZ,
                    uint32_t groupCountX,
                    uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;

   desc->root.cs.base_group[0] = baseGroupX;
   desc->root.cs.base_group[1] = baseGroupY;
   desc->root.cs.base_group[2] = baseGroupZ;
   desc->root.cs.group_count[0] = groupCountX;
   desc->root.cs.group_count[1] = groupCountY;
   desc->root.cs.group_count[2] = groupCountZ;

   uint64_t qmd_addr = nvk_flush_compute_state(cmd, NULL);
   if (unlikely(qmd_addr == 0))
      return;

   const uint32_t local_size = nvk_compute_local_size(cmd);
   const uint64_t cs_invocations =
      (uint64_t)local_size * (uint64_t)groupCountX *
      (uint64_t)groupCountY * (uint64_t)groupCountZ;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);

   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
   P_INLINE_DATA(p, cs_invocations >> 32);
   P_INLINE_DATA(p, cs_invocations);

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);

   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
         .invalidate = INVALIDATE_TRUE,
         .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }
}

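/* Store a single 32-bit value at addr + offset from an MME macro by
 * emitting a SET_REPORT_SEMAPHORE release.
 */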
static void
mme_store_global(struct mme_builder *b,
                 struct mme_value64 addr,
                 uint64_t offset,
                 struct mme_value v)
{
   if (offset > 0)
      addr = mme_add64(b, addr, mme_imm64(offset));

   mme_mthd(b, NV9097_SET_REPORT_SEMAPHORE_A);
   mme_emit_addr64(b, addr);
   mme_emit(b, v);
   mme_emit(b, mme_imm(0x10000000));

   if (offset > 0) {
      mme_free_reg(b, addr.lo);
      mme_free_reg(b, addr.hi);
   }
}

static void
mme_store_global_vec3(struct mme_builder *b,
                      struct mme_value64 addr,
                      uint32_t offset,
                      struct mme_value x,
                      struct mme_value y,
                      struct mme_value z)
{
   mme_store_global(b, addr, offset + 0, x);
   mme_store_global(b, addr, offset + 4, y);
   mme_store_global(b, addr, offset + 8, z);
}

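/* MME macro for indirect dispatch (Turing+ only).  Parameters: the local
 * workgroup size, the GPU address of the VkDispatchIndirectCommand, the
 * root descriptor address, and the QMD address.  The macro reads the
 * three group counts through the MME data FIFO, accumulates the
 * invocation count for statistics, and patches the group counts into
 * both the QMD and the root descriptor table.
 */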
void
nvk_mme_dispatch_indirect(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d < TURING_A)
      return;

   struct mme_value local_size = mme_load(b);
   struct mme_value64 dispatch_addr = mme_load_addr64(b);
   struct mme_value64 root_desc_addr = mme_load_addr64(b);
   struct mme_value64 qmd_addr = mme_load_addr64(b);

   mme_tu104_read_fifoed(b, dispatch_addr, mme_imm(3));

   uint32_t qmd_size_offset = qmd_dispatch_size_offset(b->devinfo);
   uint32_t root_desc_size_offset =
      offsetof(struct nvk_root_descriptor_table, cs.group_count);

   struct mme_value group_count_x = mme_load(b);
   struct mme_value group_count_y = mme_load(b);
   struct mme_value group_count_z = mme_load(b);

   struct mme_value64 cs1 = mme_umul_32x32_64(b, local_size, group_count_x);
   struct mme_value64 cs2 = mme_umul_32x32_64(b, group_count_y, group_count_z);
   nvk_build_mme_add_cs_invocations(b, mme_mul64(b, cs1, cs2));

   mme_store_global_vec3(b, qmd_addr, qmd_size_offset,
                         group_count_x, group_count_y, group_count_z);
   mme_store_global_vec3(b, root_desc_addr, root_desc_size_offset,
                         group_count_x, group_count_y, group_count_z);
}

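/* Indirect dispatch: the group counts live in GPU memory, so we emit the
 * NVK_MME_DISPATCH_INDIRECT macro to patch them into the QMD and root
 * descriptor on the GPU, then launch the QMD with SEND_PCAS_A.
 */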
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
                        VkBuffer _buffer,
                        VkDeviceSize offset)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;

   /* TODO: Indirect dispatch pre-Turing */
   assert(nvk_cmd_buffer_device(cmd)->pdev->info.cls_eng3d >= TURING_A);

   desc->root.cs.base_group[0] = 0;
   desc->root.cs.base_group[1] = 0;
   desc->root.cs.base_group[2] = 0;

   uint64_t dispatch_addr = nvk_buffer_address(buffer, offset);

   uint64_t root_desc_addr;
   uint64_t qmd_addr = nvk_flush_compute_state(cmd, &root_desc_addr);
   if (unlikely(qmd_addr == 0))
      return;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 14);

   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
   P_INLINE_DATA(p, nvk_compute_local_size(cmd));
   P_INLINE_DATA(p, dispatch_addr >> 32);
   P_INLINE_DATA(p, dispatch_addr);
   P_INLINE_DATA(p, root_desc_addr >> 32);
   P_INLINE_DATA(p, root_desc_addr);
   P_INLINE_DATA(p, qmd_addr >> 32);
   P_INLINE_DATA(p, qmd_addr);

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);
   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
         .invalidate = INVALIDATE_TRUE,
         .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }
}