/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_device.h"

#include "nvk_cmd_buffer.h"
#include "nvk_entrypoints.h"
#include "nvk_instance.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "vk_pipeline_cache.h"
#include "vulkan/wsi/wsi_common.h"

#include "nouveau_context.h"

#include <fcntl.h>
#include <xf86drm.h>

#include "cl9097.h"
#include "clb097.h"
#include "clc397.h"

static void
nvk_slm_area_init(struct nvk_slm_area *area)
{
   memset(area, 0, sizeof(*area));
   simple_mtx_init(&area->mutex, mtx_plain);
}

static void
nvk_slm_area_finish(struct nvk_slm_area *area)
{
   simple_mtx_destroy(&area->mutex);
   if (area->bo)
      nouveau_ws_bo_destroy(area->bo);
}

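/* Return a new reference to the current SLM BO (NULL if none has been
 * allocated yet) along with the per-warp and per-TPC sizes it was sized for.
 */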
struct nouveau_ws_bo *
nvk_slm_area_get_bo_ref(struct nvk_slm_area *area,
                        uint32_t *bytes_per_warp_out,
                        uint32_t *bytes_per_tpc_out)
{
   simple_mtx_lock(&area->mutex);
   struct nouveau_ws_bo *bo = area->bo;
   if (bo)
      nouveau_ws_bo_ref(bo);
   *bytes_per_warp_out = area->bytes_per_warp;
   *bytes_per_tpc_out = area->bytes_per_tpc;
   simple_mtx_unlock(&area->mutex);

   return bo;
}

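/* Grow the device's shader local memory (scratch) area so every thread gets
 * at least bytes_per_thread bytes.  The common no-resize case is checked
 * without taking the lock; allocation races are resolved under the lock and
 * the losing BO is destroyed.
 */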
static VkResult
nvk_slm_area_ensure(struct nvk_device *dev,
                    struct nvk_slm_area *area,
                    uint32_t bytes_per_thread)
{
   assert(bytes_per_thread < (1 << 24));

   /* TODO: Volta+ doesn't use CRS */
   const uint32_t crs_size = 0;

   uint64_t bytes_per_warp = bytes_per_thread * 32 + crs_size;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
    */
   bytes_per_warp = align64(bytes_per_warp, 0x200);

   uint64_t bytes_per_mp = bytes_per_warp * dev->pdev->info.max_warps_per_mp;
   uint64_t bytes_per_tpc = bytes_per_mp * dev->pdev->info.mp_per_tpc;

   /* The hardware seems to require this alignment for
    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
    */
   bytes_per_tpc = align64(bytes_per_tpc, 0x8000);

   /* nvk_slm_area::bytes_per_tpc only ever increases so we can check this
    * outside the lock and exit early in the common case.  We only need to
    * take the lock if we're actually going to resize.
    *
    * Also, checking bytes_per_tpc alone is enough because it increases
    * monotonically with bytes_per_warp.
    */
   if (likely(bytes_per_tpc <= area->bytes_per_tpc))
      return VK_SUCCESS;

   uint64_t size = bytes_per_tpc * dev->pdev->info.tpc_count;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_D_SIZE_LOWER.
    */
   size = align64(size, 0x20000);

   struct nouveau_ws_bo *bo =
      nouveau_ws_bo_new(dev->ws_dev, size, 0,
                        NOUVEAU_WS_BO_LOCAL | NOUVEAU_WS_BO_NO_SHARE);
   if (bo == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   struct nouveau_ws_bo *unref_bo;
   simple_mtx_lock(&area->mutex);
   if (bytes_per_tpc <= area->bytes_per_tpc) {
      /* We lost the race, throw away our BO */
      assert(area->bytes_per_warp == bytes_per_warp);
      unref_bo = bo;
   } else {
      unref_bo = area->bo;
      area->bo = bo;
      area->bytes_per_warp = bytes_per_warp;
      area->bytes_per_tpc = bytes_per_tpc;
   }
   simple_mtx_unlock(&area->mutex);

   if (unref_bo)
      nouveau_ws_bo_destroy(unref_bo);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateDevice(VkPhysicalDevice physicalDevice,
                 const VkDeviceCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator,
                 VkDevice *pDevice)
{
   VK_FROM_HANDLE(nvk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct nvk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator,
                    sizeof(*dev), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &nvk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table,
                           pCreateInfo, pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &nvk_device_shader_ops;

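   /* Look up the DRM device backing pdev->render_dev and open a nouveau
    * winsys device on it.
    */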
   drmDevicePtr drm_device = NULL;
   int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device);
   if (ret != 0) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to get DRM device: %m");
      goto fail_init;
   }

   dev->ws_dev = nouveau_ws_device_new(drm_device);
   drmFreeDevice(&drm_device);
   if (dev->ws_dev == NULL) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to create nouveau device: %m");
      goto fail_init;
   }

   vk_device_set_drm_fd(&dev->vk, dev->ws_dev->fd);
   dev->vk.command_buffer_ops = &nvk_cmd_buffer_ops;
   dev->pdev = pdev;

   result = nvk_upload_queue_init(dev, &dev->upload);
   if (result != VK_SUCCESS)
      goto fail_ws_dev;

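   /* Device-wide descriptor tables for texture/image headers (TIC) and
    * samplers (TSC).  The trailing parameters are presumably the initial and
    * maximum entry counts: the image table can grow to 1M entries while the
    * sampler table is fixed at 4096.  Entry 0 of the image table is reserved
    * as the null descriptor below.
    */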
   result = nvk_descriptor_table_init(dev, &dev->images,
                                      8 * 4 /* tic entry size */,
                                      1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_upload;

   /* Reserve the descriptor at offset 0 to be the null descriptor */
   uint32_t null_image[8] = { 0, };
   ASSERTED uint32_t null_image_index;
   result = nvk_descriptor_table_add(dev, &dev->images,
                                     null_image, sizeof(null_image),
                                     &null_image_index);
   assert(result == VK_SUCCESS);
   assert(null_image_index == 0);

   result = nvk_descriptor_table_init(dev, &dev->samplers,
                                      8 * 4 /* tsc entry size */,
                                      4096, 4096);
   if (result != VK_SUCCESS)
      goto fail_images;

   /* If we have a full BAR, go ahead and do shader uploads on the CPU.
    * Otherwise, we fall back to doing shader uploads via the upload queue.
    *
    * Also, the I-cache pre-fetches and we don't really know by how much.
    * Over-allocating shader BOs by 4K ensures we don't run past.
    */
   enum nouveau_ws_bo_map_flags shader_map_flags = 0;
   if (pdev->info.bar_size_B >= pdev->info.vram_size_B)
      shader_map_flags = NOUVEAU_WS_BO_WR;
   result = nvk_heap_init(dev, &dev->shader_heap,
                          NOUVEAU_WS_BO_LOCAL | NOUVEAU_WS_BO_NO_SHARE,
                          shader_map_flags,
                          4096 /* overalloc */,
                          dev->pdev->info.cls_eng3d < VOLTA_A);
   if (result != VK_SUCCESS)
      goto fail_samplers;

   result = nvk_heap_init(dev, &dev->event_heap,
                          NOUVEAU_WS_BO_LOCAL | NOUVEAU_WS_BO_NO_SHARE,
                          NOUVEAU_WS_BO_WR,
                          0 /* overalloc */, false /* contiguous */);
   if (result != VK_SUCCESS)
      goto fail_shader_heap;

   nvk_slm_area_init(&dev->slm);

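   /* Allocate a device-lifetime page of zeros, presumably for anything that
    * needs a known-zero allocation to point at.
    */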
   void *zero_map;
   dev->zero_page = nouveau_ws_bo_new_mapped(dev->ws_dev, 0x1000, 0,
                                             NOUVEAU_WS_BO_LOCAL |
                                             NOUVEAU_WS_BO_NO_SHARE,
                                             NOUVEAU_WS_BO_WR, &zero_map);
   if (dev->zero_page == NULL) {
      result = vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      goto fail_slm;
   }

   memset(zero_map, 0, 0x1000);
   nouveau_ws_bo_unmap(dev->zero_page, zero_map);

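   /* Only the Fermi and Kepler 3D classes need a dedicated VAB allocation
    * (presumably the vertex attribute buffer).
    */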
   if (dev->pdev->info.cls_eng3d >= FERMI_A &&
       dev->pdev->info.cls_eng3d < MAXWELL_A) {
      /* max size is 256k */
      dev->vab_memory = nouveau_ws_bo_new(dev->ws_dev, 1 << 17, 1 << 20,
                                          NOUVEAU_WS_BO_LOCAL |
                                          NOUVEAU_WS_BO_NO_SHARE);
      if (dev->vab_memory == NULL) {
         result = vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail_zero_page;
      }
   }

   result = nvk_queue_init(dev, &dev->queue,
                           &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_vab_memory;

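   /* A device-wide in-memory pipeline cache.  weak_ref presumably means the
    * cache does not hold its own references on cached objects, so they can
    * be freed once nothing else uses them.
    */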
   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = nvk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = nvk_device_to_handle(dev);

   return VK_SUCCESS;

fail_mem_cache:
   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
fail_queue:
   nvk_queue_finish(dev, &dev->queue);
fail_vab_memory:
   if (dev->vab_memory)
      nouveau_ws_bo_destroy(dev->vab_memory);
fail_zero_page:
   nouveau_ws_bo_destroy(dev->zero_page);
fail_slm:
   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
fail_shader_heap:
   nvk_heap_finish(dev, &dev->shader_heap);
fail_samplers:
   nvk_descriptor_table_finish(dev, &dev->samplers);
fail_images:
   nvk_descriptor_table_finish(dev, &dev->images);
fail_upload:
   nvk_upload_queue_finish(dev, &dev->upload);
fail_ws_dev:
   nouveau_ws_device_destroy(dev->ws_dev);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);

   if (!dev)
      return;

   nvk_device_finish_meta(dev);

   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
   nvk_queue_finish(dev, &dev->queue);
   if (dev->vab_memory)
      nouveau_ws_bo_destroy(dev->vab_memory);
   nouveau_ws_bo_destroy(dev->zero_page);
   vk_device_finish(&dev->vk);

   /* Idle the upload queue before we tear down heaps */
   nvk_upload_queue_sync(dev, &dev->upload);

   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
   nvk_heap_finish(dev, &dev->shader_heap);
   nvk_descriptor_table_finish(dev, &dev->samplers);
   nvk_descriptor_table_finish(dev, &dev->images);
   nvk_upload_queue_finish(dev, &dev->upload);
   nouveau_ws_device_destroy(dev->ws_dev);
   vk_free(&dev->vk.alloc, dev);
}

VkResult
nvk_device_ensure_slm(struct nvk_device *dev,
                      uint32_t bytes_per_thread)
{
   return nvk_slm_area_ensure(dev, &dev->slm, bytes_per_thread);
}