/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
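
/*
 * nvk_queue.c implements VkQueue for NVK: it tracks per-queue state
 * (references to the device-wide image and sampler descriptor heaps, the
 * shader local memory area, and a small internal push buffer that re-binds
 * them), provides the driver submit hook, and performs the initial
 * context-state submission when a queue is created.
 */
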
#include "nvk_queue.h"

#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_physical_device.h"
#include "nv_push.h"

#include "nouveau_context.h"

#include <xf86drm.h>

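/* Class method headers: these provide the P_MTHD()/P_IMMD() and per-method
 * push-buffer macros for the hardware classes programmed below.
 */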
#include "nvk_cl9039.h"
#include "nvk_cl9097.h"
#include "nvk_cl90b5.h"
#include "nvk_cla0c0.h"
#include "cla1c0.h"
#include "nvk_clc3c0.h"
#include "nvk_clc397.h"

static void
nvk_queue_state_init(struct nvk_queue_state *qs)
{
   memset(qs, 0, sizeof(*qs));
}

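/* Releases the BO references (and the push buffer mapping) held by the
 * queue state.
 */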
static void
nvk_queue_state_finish(struct nvk_device *dev,
                       struct nvk_queue_state *qs)
{
   if (qs->images.bo)
      nouveau_ws_bo_destroy(qs->images.bo);
   if (qs->samplers.bo)
      nouveau_ws_bo_destroy(qs->samplers.bo);
   if (qs->slm.bo)
      nouveau_ws_bo_destroy(qs->slm.bo);
   if (qs->push.bo) {
      nouveau_ws_bo_unmap(qs->push.bo, qs->push.bo_map);
      nouveau_ws_bo_destroy(qs->push.bo);
   }
}

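/* Prints the queue-state push buffer; used when NVK_DEBUG_PUSH_DUMP is set
 * or when a synchronous (NVK_DEBUG_PUSH_SYNC) submission fails.
 */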
static void
nvk_queue_state_dump_push(struct nvk_device *dev,
                          struct nvk_queue_state *qs, FILE *fp)
{
   struct nv_push push = {
      .start = (uint32_t *)qs->push.bo_map,
      .end = (uint32_t *)qs->push.bo_map + qs->push.dw_count,
   };
   vk_push_print(fp, &push, &dev->pdev->info);
}

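/* Compares the queue's cached references to the device-wide image and
 * sampler descriptor heaps and the SLM area against the current ones.  If
 * anything changed, builds a small (256 dword) push buffer that re-binds
 * them for both the compute (NVA0C0) and 3D (NV9097) classes and stores it
 * in qs->push, replacing any previous one.
 */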
VkResult
nvk_queue_state_update(struct nvk_device *dev,
                       struct nvk_queue_state *qs)
{
   struct nouveau_ws_bo *bo;
   uint32_t alloc_count, bytes_per_warp, bytes_per_tpc;
   bool dirty = false;

   bo = nvk_descriptor_table_get_bo_ref(&dev->images, &alloc_count);
   if (qs->images.bo != bo || qs->images.alloc_count != alloc_count) {
      if (qs->images.bo)
         nouveau_ws_bo_destroy(qs->images.bo);
      qs->images.bo = bo;
      qs->images.alloc_count = alloc_count;
      dirty = true;
   } else {
      /* No change */
      if (bo)
         nouveau_ws_bo_destroy(bo);
   }

   bo = nvk_descriptor_table_get_bo_ref(&dev->samplers, &alloc_count);
   if (qs->samplers.bo != bo || qs->samplers.alloc_count != alloc_count) {
      if (qs->samplers.bo)
         nouveau_ws_bo_destroy(qs->samplers.bo);
      qs->samplers.bo = bo;
      qs->samplers.alloc_count = alloc_count;
      dirty = true;
   } else {
      /* No change */
      if (bo)
         nouveau_ws_bo_destroy(bo);
   }

   bo = nvk_slm_area_get_bo_ref(&dev->slm, &bytes_per_warp, &bytes_per_tpc);
   if (qs->slm.bo != bo || qs->slm.bytes_per_warp != bytes_per_warp ||
       qs->slm.bytes_per_tpc != bytes_per_tpc) {
      if (qs->slm.bo)
         nouveau_ws_bo_destroy(qs->slm.bo);
      qs->slm.bo = bo;
      qs->slm.bytes_per_warp = bytes_per_warp;
      qs->slm.bytes_per_tpc = bytes_per_tpc;
      dirty = true;
   } else {
      /* No change */
      if (bo)
         nouveau_ws_bo_destroy(bo);
   }

   /* TODO: We're currently depending on kernel reference counting to protect
    * us here.  If we ever stop reference counting in the kernel, we will
    * either need to delay destruction or hold on to our extra BO references
    * and insert a GPU stall here if anything has changed before dropping our
    * old references.
    */

   if (!dirty)
      return VK_SUCCESS;

   struct nouveau_ws_bo *push_bo;
   void *push_map;
   push_bo = nouveau_ws_bo_new_mapped(dev->ws_dev, 256 * 4, 0,
                                      NOUVEAU_WS_BO_GART |
                                      NOUVEAU_WS_BO_MAP |
                                      NOUVEAU_WS_BO_NO_SHARE,
                                      NOUVEAU_WS_BO_WR, &push_map);
   if (push_bo == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   struct nv_push push;
   nv_push_init(&push, push_map, 256);
   struct nv_push *p = &push;

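   /* Each binding below is emitted twice: once with compute class (NVA0C0)
    * methods and once with 3D class (NV9097) methods, so both engines see
    * the same descriptor pools and SLM configuration.
    */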
   if (qs->images.bo) {
      /* Compute */
      P_MTHD(p, NVA0C0, SET_TEX_HEADER_POOL_A);
      P_NVA0C0_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
      P_NVA0C0_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
      P_NVA0C0_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
      P_IMMD(p, NVA0C0, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
         .lines = LINES_ALL
      });

      /* 3D */
      P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
      P_NV9097_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
      P_NV9097_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
      P_NV9097_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
      P_IMMD(p, NV9097, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
         .lines = LINES_ALL
      });
   }

   if (qs->samplers.bo) {
      /* Compute */
      P_MTHD(p, NVA0C0, SET_TEX_SAMPLER_POOL_A);
      P_NVA0C0_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
      P_NVA0C0_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
      P_NVA0C0_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
      P_IMMD(p, NVA0C0, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
         .lines = LINES_ALL
      });

      /* 3D */
      P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
      P_NV9097_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
      P_NV9097_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
      P_NV9097_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
      P_IMMD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
         .lines = LINES_ALL
      });
   }

   if (qs->slm.bo) {
      const uint64_t slm_addr = qs->slm.bo->offset;
      const uint64_t slm_size = qs->slm.bo->size;
      const uint64_t slm_per_warp = qs->slm.bytes_per_warp;
      const uint64_t slm_per_tpc = qs->slm.bytes_per_tpc;
      assert(!(slm_per_tpc & 0x7fff));

      /* Compute */
      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);

      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(p, slm_per_tpc >> 32);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(p, slm_per_tpc);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(p, 0xff);

      if (dev->pdev->info.cls_compute < VOLTA_COMPUTE_A) {
         P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(p, slm_per_tpc >> 32);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(p, slm_per_tpc);
         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(p, 0xff);
      }

      /* 3D */
      P_MTHD(p, NV9097, SET_SHADER_LOCAL_MEMORY_A);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_C(p, slm_size >> 32);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_D(p, slm_size);
      P_NV9097_SET_SHADER_LOCAL_MEMORY_E(p, slm_per_warp);
   }

   /* We set memory windows unconditionally.  Otherwise, the memory window
    * might be in a random place and cause us to fault off into nowhere.
    */
   if (dev->pdev->info.cls_compute >= VOLTA_COMPUTE_A) {
      uint64_t temp = 0xfeULL << 24;
      P_MTHD(p, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(p, temp >> 32);
      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(p, temp & 0xffffffff);

      temp = 0xffULL << 24;
      P_MTHD(p, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(p, temp >> 32);
      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(p, temp & 0xffffffff);
   } else {
      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(p, 0xff << 24);

      P_MTHD(p, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(p, 0xfe << 24);
   }

   /* From nvc0_screen.c:
    *
    *    "Reduce likelihood of collision with real buffers by placing the
    *    hole at the top of the 4G area. This will have to be dealt with
    *    for real eventually by blocking off that area from the VM."
    *
    * Really?!?  TODO: Fix this for realz.  Annoyingly, we only have a
    * 32-bit pointer for this in 3D rather than a full 48 like we have for
    * compute.
    */
   P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff << 24);

   if (qs->push.bo) {
      nouveau_ws_bo_unmap(qs->push.bo, qs->push.bo_map);
      nouveau_ws_bo_destroy(qs->push.bo);
   }

   qs->push.bo = push_bo;
   qs->push.bo_map = push_map;
   qs->push.dw_count = nv_push_dw_count(&push);

   return VK_SUCCESS;
}

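/* vk_queue::driver_submit hook.  Refreshes the queue state, hands the
 * submission to the drm_nouveau back-end, and dumps the state push buffer
 * and command buffers when NVK_DEBUG_PUSH_DUMP is set or when a synchronous
 * (NVK_DEBUG_PUSH_SYNC) submission fails.  Any failure marks the queue lost.
 */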
static VkResult
nvk_queue_submit(struct vk_queue *vk_queue,
                 struct vk_queue_submit *submit)
{
   struct nvk_queue *queue = container_of(vk_queue, struct nvk_queue, vk);
   struct nvk_device *dev = nvk_queue_device(queue);
   VkResult result;

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   result = nvk_queue_state_update(dev, &queue->state);
   if (result != VK_SUCCESS) {
      return vk_queue_set_lost(&queue->vk, "Failed to update queue base "
                                           "pointers pushbuf");
   }

   const bool sync = dev->ws_dev->debug_flags & NVK_DEBUG_PUSH_SYNC;

   result = nvk_queue_submit_drm_nouveau(queue, submit, sync);

   if ((sync && result != VK_SUCCESS) ||
       (dev->ws_dev->debug_flags & NVK_DEBUG_PUSH_DUMP)) {
      nvk_queue_state_dump_push(dev, &queue->state, stderr);

      for (unsigned i = 0; i < submit->command_buffer_count; i++) {
         struct nvk_cmd_buffer *cmd =
            container_of(submit->command_buffers[i], struct nvk_cmd_buffer, vk);

         nvk_cmd_buffer_dump(cmd, stderr);
      }
   }

   if (result != VK_SUCCESS)
      return vk_queue_set_lost(&queue->vk, "Submit failed");

   return VK_SUCCESS;
}

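/* Emits the initial context state for a new queue (the M2MF object on
 * Fermi-class hardware, plus draw and/or dispatch state depending on the
 * queue's capabilities) and submits it with nvk_queue_submit_simple().
 */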
static VkResult
nvk_queue_init_context_state(struct nvk_queue *queue,
                             VkQueueFlags queue_flags)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   uint32_t push_data[2048];
   struct nv_push push;
   nv_push_init(&push, push_data, ARRAY_SIZE(push_data));
   struct nv_push *p = &push;

   /* M2MF state */
   if (pdev->info.cls_m2mf <= FERMI_MEMORY_TO_MEMORY_FORMAT_A) {
      /* we absolutely do not support Fermi, but if somebody wants to toy
       * around with it, this is a must
       */
      P_MTHD(p, NV9039, SET_OBJECT);
      P_NV9039_SET_OBJECT(p, {
         .class_id = dev->pdev->info.cls_m2mf,
         .engine_id = 0,
      });
   }

   if (queue_flags & VK_QUEUE_GRAPHICS_BIT) {
      result = nvk_push_draw_state_init(dev, p);
      if (result != VK_SUCCESS)
         return result;
   }

   if (queue_flags & VK_QUEUE_COMPUTE_BIT) {
      result = nvk_push_dispatch_state_init(dev, p);
      if (result != VK_SUCCESS)
         return result;
   }

   return nvk_queue_submit_simple(queue, nv_push_dw_count(&push),
                                  push_data, 0, NULL);
}

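/* Creates a queue: picks up the queue family's capability flags (widening
 * them, since queries rely on compute shaders and indirect dispatch relies
 * on 3D engine MMEs), then initializes the common vk_queue, the queue
 * state, the drm_nouveau back-end, and the initial context state.
 */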
VkResult
nvk_queue_init(struct nvk_device *dev, struct nvk_queue *queue,
               const VkDeviceQueueCreateInfo *pCreateInfo,
               uint32_t index_in_family)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   assert(pCreateInfo->queueFamilyIndex < pdev->queue_family_count);
   const struct nvk_queue_family *queue_family =
      &pdev->queue_families[pCreateInfo->queueFamilyIndex];

   VkQueueFlags queue_flags = queue_family->queue_flags;

   /* We rely on compute shaders for queries */
   if (queue_family->queue_flags & VK_QUEUE_GRAPHICS_BIT)
      queue_flags |= VK_QUEUE_COMPUTE_BIT;

   /* We currently rely on 3D engine MMEs for indirect dispatch */
   if (queue_family->queue_flags & VK_QUEUE_COMPUTE_BIT)
      queue_flags |= VK_QUEUE_GRAPHICS_BIT;

   result = vk_queue_init(&queue->vk, &dev->vk, pCreateInfo, index_in_family);
   if (result != VK_SUCCESS)
      return result;

   queue->vk.driver_submit = nvk_queue_submit;

   nvk_queue_state_init(&queue->state);

   result = nvk_queue_init_drm_nouveau(dev, queue, queue_flags);
   if (result != VK_SUCCESS)
      goto fail_init;

   result = nvk_queue_init_context_state(queue, queue_flags);
   if (result != VK_SUCCESS)
      goto fail_drm;

   return VK_SUCCESS;

fail_drm:
   nvk_queue_finish_drm_nouveau(dev, queue);
fail_init:
   vk_queue_finish(&queue->vk);

   return result;
}

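/* Destroys the queue state, the drm_nouveau back-end, and the common
 * vk_queue.
 */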
void
nvk_queue_finish(struct nvk_device *dev, struct nvk_queue *queue)
{
   nvk_queue_state_finish(dev, &queue->state);
   nvk_queue_finish_drm_nouveau(dev, queue);
   vk_queue_finish(&queue->vk);
}

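/* Copies dw_count dwords of caller-provided push data into a freshly
 * allocated GART BO and submits it (together with any extra BOs) through
 * the drm_nouveau back-end, dumping the push buffer under the usual debug
 * flags.  A sketch of the calling pattern, mirroring
 * nvk_queue_init_context_state() above (buffer size up to the caller):
 *
 *    uint32_t push_data[64];
 *    struct nv_push push;
 *    nv_push_init(&push, push_data, ARRAY_SIZE(push_data));
 *    ...emit methods with P_MTHD()/P_IMMD()...
 *    result = nvk_queue_submit_simple(queue, nv_push_dw_count(&push),
 *                                     push_data, 0, NULL);
 */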
VkResult
nvk_queue_submit_simple(struct nvk_queue *queue,
                        uint32_t dw_count, const uint32_t *dw,
                        uint32_t extra_bo_count,
                        struct nouveau_ws_bo **extra_bos)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   struct nouveau_ws_bo *push_bo;
   VkResult result;

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   void *push_map;
   push_bo = nouveau_ws_bo_new_mapped(dev->ws_dev, dw_count * 4, 0,
                                      NOUVEAU_WS_BO_GART |
                                      NOUVEAU_WS_BO_MAP |
                                      NOUVEAU_WS_BO_NO_SHARE,
                                      NOUVEAU_WS_BO_WR, &push_map);
   if (push_bo == NULL)
      return vk_error(queue, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   memcpy(push_map, dw, dw_count * 4);

   result = nvk_queue_submit_simple_drm_nouveau(queue, dw_count, push_bo,
                                                extra_bo_count, extra_bos);

   const bool debug_sync = dev->ws_dev->debug_flags & NVK_DEBUG_PUSH_SYNC;
   if ((debug_sync && result != VK_SUCCESS) ||
       (dev->ws_dev->debug_flags & NVK_DEBUG_PUSH_DUMP)) {
      struct nv_push push = {
         .start = (uint32_t *)dw,
         .end = (uint32_t *)dw + dw_count,
      };
      vk_push_print(stderr, &push, &dev->pdev->info);
   }

   nouveau_ws_bo_unmap(push_bo, push_map);
   nouveau_ws_bo_destroy(push_bo);

   if (result != VK_SUCCESS)
      return vk_queue_set_lost(&queue->vk, "Submit failed");

   return VK_SUCCESS;
}