1 /*
2 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "nvk_queue.h"
6
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_device.h"
9 #include "nvk_physical_device.h"
10 #include "nv_push.h"
11
12 #include "nouveau_context.h"
13
14 #include <xf86drm.h>
15
16 #include "nvk_cl9039.h"
17 #include "nvk_cl9097.h"
18 #include "nvk_cl90b5.h"
19 #include "nvk_cla0c0.h"
20 #include "cla1c0.h"
21 #include "nvk_clc3c0.h"
22 #include "nvk_clc397.h"
23
static void
nvk_queue_state_init(struct nvk_queue_state *qs)
{
   memset(qs, 0, sizeof(*qs));
}

static void
nvk_queue_state_finish(struct nvk_device *dev,
                       struct nvk_queue_state *qs)
{
   if (qs->images.bo)
      nouveau_ws_bo_destroy(qs->images.bo);
   if (qs->samplers.bo)
      nouveau_ws_bo_destroy(qs->samplers.bo);
   if (qs->slm.bo)
      nouveau_ws_bo_destroy(qs->slm.bo);
   if (qs->push.bo) {
      nouveau_ws_bo_unmap(qs->push.bo, qs->push.bo_map);
      nouveau_ws_bo_destroy(qs->push.bo);
   }
}

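/* Prints the queue state's internal pushbuf to the given stream.  Used by
 * the submit path below when NVK_DEBUG_PUSH_SYNC/NVK_DEBUG_PUSH_DUMP
 * debugging is enabled.
 */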
static void
nvk_queue_state_dump_push(struct nvk_device *dev,
                          struct nvk_queue_state *qs, FILE *fp)
{
   struct nv_push push = {
      .start = (uint32_t *)qs->push.bo_map,
      .end = (uint32_t *)qs->push.bo_map + qs->push.dw_count,
   };
   vk_push_print(fp, &push, &dev->pdev->info);
}

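/* Re-checks the device-wide image and sampler descriptor tables and the SLM
 * area and, if any of their backing BOs changed, rebuilds the small pushbuf
 * that binds them (texture header pool, sampler pool, shader local memory,
 * and the local/shared memory windows) for both the compute and 3D classes.
 * The result is kept in qs->push for the submit path.
 */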
VkResult
nvk_queue_state_update(struct nvk_device *dev,
                       struct nvk_queue_state *qs)
{
   struct nouveau_ws_bo *bo;
   uint32_t alloc_count, bytes_per_warp, bytes_per_tpc;
   bool dirty = false;

   bo = nvk_descriptor_table_get_bo_ref(&dev->images, &alloc_count);
   if (qs->images.bo != bo || qs->images.alloc_count != alloc_count) {
      if (qs->images.bo)
         nouveau_ws_bo_destroy(qs->images.bo);
      qs->images.bo = bo;
      qs->images.alloc_count = alloc_count;
      dirty = true;
   } else {
      /* No change */
      if (bo)
         nouveau_ws_bo_destroy(bo);
   }

   bo = nvk_descriptor_table_get_bo_ref(&dev->samplers, &alloc_count);
   if (qs->samplers.bo != bo || qs->samplers.alloc_count != alloc_count) {
      if (qs->samplers.bo)
         nouveau_ws_bo_destroy(qs->samplers.bo);
      qs->samplers.bo = bo;
      qs->samplers.alloc_count = alloc_count;
      dirty = true;
   } else {
      /* No change */
      if (bo)
         nouveau_ws_bo_destroy(bo);
   }

   bo = nvk_slm_area_get_bo_ref(&dev->slm, &bytes_per_warp, &bytes_per_tpc);
   if (qs->slm.bo != bo || qs->slm.bytes_per_warp != bytes_per_warp ||
       qs->slm.bytes_per_tpc != bytes_per_tpc) {
      if (qs->slm.bo)
         nouveau_ws_bo_destroy(qs->slm.bo);
      qs->slm.bo = bo;
      qs->slm.bytes_per_warp = bytes_per_warp;
      qs->slm.bytes_per_tpc = bytes_per_tpc;
      dirty = true;
   } else {
      /* No change */
      if (bo)
         nouveau_ws_bo_destroy(bo);
   }

   /* TODO: We're currently depending on kernel reference counting to protect
    * us here. If we ever stop reference counting in the kernel, we will
    * either need to delay destruction or hold on to our extra BO references
    * and insert a GPU stall here if anything has changed before dropping our
    * old references.
    */

   if (!dirty)
      return VK_SUCCESS;

   struct nouveau_ws_bo *push_bo;
   void *push_map;
   push_bo = nouveau_ws_bo_new_mapped(dev->ws_dev, 256 * 4, 0,
                                      NOUVEAU_WS_BO_GART |
                                      NOUVEAU_WS_BO_MAP |
                                      NOUVEAU_WS_BO_NO_SHARE,
                                      NOUVEAU_WS_BO_WR, &push_map);
   if (push_bo == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   struct nv_push push;
   nv_push_init(&push, push_map, 256);
   struct nv_push *p = &push;

   if (qs->images.bo) {
      /* Compute */
      P_MTHD(p, NVA0C0, SET_TEX_HEADER_POOL_A);
      P_NVA0C0_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
      P_NVA0C0_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
      P_NVA0C0_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
      P_IMMD(p, NVA0C0, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
         .lines = LINES_ALL
      });

      /* 3D */
      P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
      P_NV9097_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
      P_NV9097_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
      P_NV9097_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
      P_IMMD(p, NV9097, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
         .lines = LINES_ALL
      });
   }

   if (qs->samplers.bo) {
      /* Compute */
      P_MTHD(p, NVA0C0, SET_TEX_SAMPLER_POOL_A);
      P_NVA0C0_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
      P_NVA0C0_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
      P_NVA0C0_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
      P_IMMD(p, NVA0C0, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
         .lines = LINES_ALL
      });

      /* 3D */
      P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
      P_NV9097_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
      P_NV9097_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
      P_NV9097_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
      P_IMMD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
         .lines = LINES_ALL
      });
   }

   if (qs->slm.bo) {
      const uint64_t slm_addr = qs->slm.bo->offset;
      const uint64_t slm_size = qs->slm.bo->size;
      const uint64_t slm_per_warp = qs->slm.bytes_per_warp;
      const uint64_t slm_per_tpc = qs->slm.bytes_per_tpc;
      assert(!(slm_per_tpc & 0x7fff));

      /* Compute */
      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);

      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(p, slm_per_tpc >> 32);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(p, slm_per_tpc);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(p, 0xff);

      if (dev->pdev->info.cls_compute < VOLTA_COMPUTE_A) {
         P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(p, slm_per_tpc >> 32);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(p, slm_per_tpc);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(p, 0xff);
      }

      /* 3D */
      P_MTHD(p, NV9097, SET_SHADER_LOCAL_MEMORY_A);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_C(p, slm_size >> 32);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_D(p, slm_size);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_E(p, slm_per_warp);
   }

   /* We set memory windows unconditionally. Otherwise, the memory window
    * might be in a random place and cause us to fault off into nowhere.
    */
   if (dev->pdev->info.cls_compute >= VOLTA_COMPUTE_A) {
      uint64_t temp = 0xfeULL << 24;
      P_MTHD(p, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(p, temp >> 32);
      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(p, temp & 0xffffffff);

      temp = 0xffULL << 24;
      P_MTHD(p, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(p, temp >> 32);
      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(p, temp & 0xffffffff);
   } else {
      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(p, 0xff << 24);

      P_MTHD(p, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(p, 0xfe << 24);
   }

   /* From nvc0_screen.c:
    *
    *    "Reduce likelihood of collision with real buffers by placing the
    *     hole at the top of the 4G area. This will have to be dealt with
    *     for real eventually by blocking off that area from the VM."
    *
    * Really?!?  TODO: Fix this for realz.  Annoyingly, we only have a
    * 32-bit pointer for this in 3D rather than a full 48 like we have for
    * compute.
    */
   P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff << 24);

   if (qs->push.bo) {
      nouveau_ws_bo_unmap(qs->push.bo, qs->push.bo_map);
      nouveau_ws_bo_destroy(qs->push.bo);
   }

   qs->push.bo = push_bo;
   qs->push.bo_map = push_map;
   qs->push.dw_count = nv_push_dw_count(&push);

   return VK_SUCCESS;
}

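/* vk_queue::driver_submit hook.  Refreshes the queue state pushbuf, hands the
 * submit off to the drm_nouveau backend, and dumps the state pushbuf and
 * command buffers when NVK_DEBUG_PUSH_DUMP is set or a synchronous
 * (NVK_DEBUG_PUSH_SYNC) submit fails.  Any failure marks the queue as lost.
 */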
static VkResult
nvk_queue_submit(struct vk_queue *vk_queue,
                 struct vk_queue_submit *submit)
{
   struct nvk_queue *queue = container_of(vk_queue, struct nvk_queue, vk);
   struct nvk_device *dev = nvk_queue_device(queue);
   VkResult result;

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   result = nvk_queue_state_update(dev, &queue->state);
   if (result != VK_SUCCESS) {
      return vk_queue_set_lost(&queue->vk, "Failed to update queue base "
                                           "pointers pushbuf");
   }

   const bool sync = dev->ws_dev->debug_flags & NVK_DEBUG_PUSH_SYNC;

   result = nvk_queue_submit_drm_nouveau(queue, submit, sync);

   if ((sync && result != VK_SUCCESS) ||
       (dev->ws_dev->debug_flags & NVK_DEBUG_PUSH_DUMP)) {
      nvk_queue_state_dump_push(dev, &queue->state, stderr);

      for (unsigned i = 0; i < submit->command_buffer_count; i++) {
         struct nvk_cmd_buffer *cmd =
            container_of(submit->command_buffers[i], struct nvk_cmd_buffer, vk);

         nvk_cmd_buffer_dump(cmd, stderr);
      }
   }

   if (result != VK_SUCCESS)
      return vk_queue_set_lost(&queue->vk, "Submit failed");

   return VK_SUCCESS;
}

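/* Emits the one-time context setup for a newly created queue: the M2MF object
 * binding where required, plus initial 3D and/or compute state depending on
 * the queue's capabilities, and submits it via nvk_queue_submit_simple().
 */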
static VkResult
nvk_queue_init_context_state(struct nvk_queue *queue,
                             VkQueueFlags queue_flags)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   uint32_t push_data[2048];
   struct nv_push push;
   nv_push_init(&push, push_data, ARRAY_SIZE(push_data));
   struct nv_push *p = &push;

   /* M2MF state */
   if (pdev->info.cls_m2mf <= FERMI_MEMORY_TO_MEMORY_FORMAT_A) {
      /* We absolutely do not support Fermi, but if somebody wants to toy
       * around with it, this is a must.
       */
      P_MTHD(p, NV9039, SET_OBJECT);
      P_NV9039_SET_OBJECT(p, {
         .class_id = dev->pdev->info.cls_m2mf,
         .engine_id = 0,
      });
   }

   if (queue_flags & VK_QUEUE_GRAPHICS_BIT) {
      result = nvk_push_draw_state_init(dev, p);
      if (result != VK_SUCCESS)
         return result;
   }

   if (queue_flags & VK_QUEUE_COMPUTE_BIT) {
      result = nvk_push_dispatch_state_init(dev, p);
      if (result != VK_SUCCESS)
         return result;
   }

   return nvk_queue_submit_simple(queue, nv_push_dw_count(&push),
                                  push_data, 0, NULL);
}

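/* Creates an nvk_queue.  Note that the flags used for engine/context setup
 * may be a superset of the queue family's advertised flags: graphics queues
 * also get compute (queries are implemented with compute shaders) and
 * compute queues also get 3D (indirect dispatch uses 3D-engine MMEs).
 */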
VkResult
nvk_queue_init(struct nvk_device *dev, struct nvk_queue *queue,
               const VkDeviceQueueCreateInfo *pCreateInfo,
               uint32_t index_in_family)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   assert(pCreateInfo->queueFamilyIndex < pdev->queue_family_count);
   const struct nvk_queue_family *queue_family =
      &pdev->queue_families[pCreateInfo->queueFamilyIndex];

   VkQueueFlags queue_flags = queue_family->queue_flags;

   /* We rely on compute shaders for queries */
   if (queue_family->queue_flags & VK_QUEUE_GRAPHICS_BIT)
      queue_flags |= VK_QUEUE_COMPUTE_BIT;

   /* We currently rely on 3D engine MMEs for indirect dispatch */
   if (queue_family->queue_flags & VK_QUEUE_COMPUTE_BIT)
      queue_flags |= VK_QUEUE_GRAPHICS_BIT;

   result = vk_queue_init(&queue->vk, &dev->vk, pCreateInfo, index_in_family);
   if (result != VK_SUCCESS)
      return result;

   queue->vk.driver_submit = nvk_queue_submit;

   nvk_queue_state_init(&queue->state);

   result = nvk_queue_init_drm_nouveau(dev, queue, queue_flags);
   if (result != VK_SUCCESS)
      goto fail_init;

   result = nvk_queue_init_context_state(queue, queue_flags);
   if (result != VK_SUCCESS)
      goto fail_drm;

   return VK_SUCCESS;

fail_drm:
   nvk_queue_finish_drm_nouveau(dev, queue);
fail_init:
   vk_queue_finish(&queue->vk);

   return result;
}

void
nvk_queue_finish(struct nvk_device *dev, struct nvk_queue *queue)
{
   nvk_queue_state_finish(dev, &queue->state);
   nvk_queue_finish_drm_nouveau(dev, queue);
   vk_queue_finish(&queue->vk);
}

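/* Submits a raw array of pushbuf dwords, bypassing command buffers.  The
 * dwords are copied into a freshly allocated GART BO, submitted together
 * with any extra BOs the caller needs resident, and the BO is unmapped and
 * destroyed before returning.
 *
 * Typical usage (a sketch, mirroring nvk_queue_init_context_state() above):
 *
 *    uint32_t push_data[64];
 *    struct nv_push push;
 *    nv_push_init(&push, push_data, ARRAY_SIZE(push_data));
 *    ... emit methods with P_MTHD()/P_IMMD() ...
 *    result = nvk_queue_submit_simple(queue, nv_push_dw_count(&push),
 *                                     push_data, 0, NULL);
 */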
VkResult
nvk_queue_submit_simple(struct nvk_queue *queue,
                        uint32_t dw_count, const uint32_t *dw,
                        uint32_t extra_bo_count,
                        struct nouveau_ws_bo **extra_bos)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   struct nouveau_ws_bo *push_bo;
   VkResult result;

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   void *push_map;
   push_bo = nouveau_ws_bo_new_mapped(dev->ws_dev, dw_count * 4, 0,
                                      NOUVEAU_WS_BO_GART |
                                      NOUVEAU_WS_BO_MAP |
                                      NOUVEAU_WS_BO_NO_SHARE,
                                      NOUVEAU_WS_BO_WR, &push_map);
   if (push_bo == NULL)
      return vk_error(queue, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   memcpy(push_map, dw, dw_count * 4);

   result = nvk_queue_submit_simple_drm_nouveau(queue, dw_count, push_bo,
                                                extra_bo_count, extra_bos);

   const bool debug_sync = dev->ws_dev->debug_flags & NVK_DEBUG_PUSH_SYNC;
   if ((debug_sync && result != VK_SUCCESS) ||
       (dev->ws_dev->debug_flags & NVK_DEBUG_PUSH_DUMP)) {
      struct nv_push push = {
         .start = (uint32_t *)dw,
         .end = (uint32_t *)dw + dw_count,
      };
      vk_push_print(stderr, &push, &dev->pdev->info);
   }

   nouveau_ws_bo_unmap(push_bo, push_map);
   nouveau_ws_bo_destroy(push_bo);

   if (result != VK_SUCCESS)
      return vk_queue_set_lost(&queue->vk, "Submit failed");

   return VK_SUCCESS;
}