/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_device.h"

#include "nvk_cmd_buffer.h"
#include "nvk_entrypoints.h"
#include "nvk_instance.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "vk_pipeline_cache.h"
#include "vulkan/wsi/wsi_common.h"

#include "nouveau_context.h"

#include <fcntl.h>
#include <xf86drm.h>

#include "cl9097.h"
#include "clb097.h"
#include "clc397.h"

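/* Shader local memory (SLM) provides per-thread scratch space.  NVK uses a
 * single, device-wide SLM buffer which is grown on demand by
 * nvk_slm_area_ensure() as shaders with larger scratch requirements appear.
 */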
static void
nvk_slm_area_init(struct nvk_slm_area *area)
{
   memset(area, 0, sizeof(*area));
   simple_mtx_init(&area->mutex, mtx_plain);
}

static void
nvk_slm_area_finish(struct nvk_slm_area *area)
{
   simple_mtx_destroy(&area->mutex);
   if (area->bo)
      nouveau_ws_bo_destroy(area->bo);
}

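/* Returns a reference to the current SLM BO (or NULL if none has been
 * allocated yet) along with the per-warp and per-TPC sizes needed to program
 * it.  The caller is responsible for dropping the reference.
 */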
struct nouveau_ws_bo *
nvk_slm_area_get_bo_ref(struct nvk_slm_area *area,
                        uint32_t *bytes_per_warp_out,
                        uint32_t *bytes_per_tpc_out)
{
   simple_mtx_lock(&area->mutex);
   struct nouveau_ws_bo *bo = area->bo;
   if (bo)
      nouveau_ws_bo_ref(bo);
   *bytes_per_warp_out = area->bytes_per_warp;
   *bytes_per_tpc_out = area->bytes_per_tpc;
   simple_mtx_unlock(&area->mutex);

   return bo;
}

static VkResult
nvk_slm_area_ensure(struct nvk_device *dev,
                    struct nvk_slm_area *area,
                    uint32_t bytes_per_thread)
{
   assert(bytes_per_thread < (1 << 24));

   /* TODO: Volta+ doesn't use a CRS */
   const uint32_t crs_size = 0;

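   /* 32 threads per warp */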
   uint64_t bytes_per_warp = bytes_per_thread * 32 + crs_size;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
    */
   bytes_per_warp = align64(bytes_per_warp, 0x200);

   uint64_t bytes_per_mp = bytes_per_warp * dev->pdev->info.max_warps_per_mp;
   uint64_t bytes_per_tpc = bytes_per_mp * dev->pdev->info.mp_per_tpc;

   /* The hardware seems to require this alignment for
    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
    */
   bytes_per_tpc = align64(bytes_per_tpc, 0x8000);

   /* nvk_slm_area::bytes_per_tpc only ever increases so we can check this
    * outside the lock and exit early in the common case.  We only need to
    * take the lock if we're actually going to resize.
    *
    * Also, we only need to compare bytes_per_tpc and not bytes_per_warp
    * because bytes_per_tpc increases monotonically with bytes_per_warp.
    */
   if (likely(bytes_per_tpc <= area->bytes_per_tpc))
      return VK_SUCCESS;

   uint64_t size = bytes_per_tpc * dev->pdev->info.tpc_count;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_D_SIZE_LOWER.
    */
   size = align64(size, 0x20000);

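   /* Allocate the new SLM BO outside the lock.  If another thread beats us
    * to it, we simply throw our buffer away below.
    */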
   struct nouveau_ws_bo *bo =
      nouveau_ws_bo_new(dev->ws_dev, size, 0,
                        NOUVEAU_WS_BO_LOCAL | NOUVEAU_WS_BO_NO_SHARE);
   if (bo == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   struct nouveau_ws_bo *unref_bo;
   simple_mtx_lock(&area->mutex);
   if (bytes_per_tpc <= area->bytes_per_tpc) {
      /* We lost the race, throw away our BO */
      assert(area->bytes_per_warp == bytes_per_warp);
      unref_bo = bo;
   } else {
      unref_bo = area->bo;
      area->bo = bo;
      area->bytes_per_warp = bytes_per_warp;
      area->bytes_per_tpc = bytes_per_tpc;
   }
   simple_mtx_unlock(&area->mutex);

   if (unref_bo)
      nouveau_ws_bo_destroy(unref_bo);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateDevice(VkPhysicalDevice physicalDevice,
                 const VkDeviceCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator,
                 VkDevice *pDevice)
{
   VK_FROM_HANDLE(nvk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct nvk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator,
                    sizeof(*dev), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &nvk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table,
                           pCreateInfo, pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &nvk_device_shader_ops;

   drmDevicePtr drm_device = NULL;
   int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device);
   if (ret != 0) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to get DRM device: %m");
      goto fail_init;
   }

   dev->ws_dev = nouveau_ws_device_new(drm_device);
   drmFreeDevice(&drm_device);
   if (dev->ws_dev == NULL) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to get DRM device: %m");
      goto fail_init;
   }

   vk_device_set_drm_fd(&dev->vk, dev->ws_dev->fd);
   dev->vk.command_buffer_ops = &nvk_cmd_buffer_ops;
   dev->pdev = pdev;

   result = nvk_upload_queue_init(dev, &dev->upload);
   if (result != VK_SUCCESS)
      goto fail_ws_dev;

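   /* Device-global image descriptor (TIC) table */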
   result = nvk_descriptor_table_init(dev, &dev->images,
                                      8 * 4 /* tic entry size */,
                                      1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_upload;

   /* Reserve the descriptor at offset 0 to be the null descriptor */
   uint32_t null_image[8] = { 0, };
   ASSERTED uint32_t null_image_index;
   result = nvk_descriptor_table_add(dev, &dev->images,
                                     null_image, sizeof(null_image),
                                     &null_image_index);
   assert(result == VK_SUCCESS);
   assert(null_image_index == 0);

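   /* Device-global sampler descriptor (TSC) table */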
   result = nvk_descriptor_table_init(dev, &dev->samplers,
                                      8 * 4 /* tsc entry size */,
                                      4096, 4096);
   if (result != VK_SUCCESS)
      goto fail_images;

   /* If we have a full BAR, go ahead and do shader uploads on the CPU.
    * Otherwise, we fall back to doing shader uploads via the upload queue.
    *
    * Also, the I-cache pre-fetches and we don't really know by how much.
    * Over-allocating shader BOs by 4K ensures we don't run past.
    */
   enum nouveau_ws_bo_map_flags shader_map_flags = 0;
   if (pdev->info.bar_size_B >= pdev->info.vram_size_B)
      shader_map_flags = NOUVEAU_WS_BO_WR;
   result = nvk_heap_init(dev, &dev->shader_heap,
                          NOUVEAU_WS_BO_LOCAL | NOUVEAU_WS_BO_NO_SHARE,
                          shader_map_flags,
                          4096 /* overalloc */,
                          dev->pdev->info.cls_eng3d < VOLTA_A);
   if (result != VK_SUCCESS)
      goto fail_samplers;

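   /* CPU-writable heap used for event (VkEvent) storage */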
   result = nvk_heap_init(dev, &dev->event_heap,
                          NOUVEAU_WS_BO_LOCAL | NOUVEAU_WS_BO_NO_SHARE,
                          NOUVEAU_WS_BO_WR,
                          0 /* overalloc */, false /* contiguous */);
   if (result != VK_SUCCESS)
      goto fail_shader_heap;

   nvk_slm_area_init(&dev->slm);

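   /* Allocate a page of zeros to serve as the device's zero page */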
   void *zero_map;
   dev->zero_page = nouveau_ws_bo_new_mapped(dev->ws_dev, 0x1000, 0,
                                             NOUVEAU_WS_BO_LOCAL |
                                             NOUVEAU_WS_BO_NO_SHARE,
                                             NOUVEAU_WS_BO_WR, &zero_map);
   if (dev->zero_page == NULL) {
      result = vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      goto fail_slm;
   }

   memset(zero_map, 0, 0x1000);
   nouveau_ws_bo_unmap(dev->zero_page, zero_map);

   if (dev->pdev->info.cls_eng3d >= FERMI_A &&
       dev->pdev->info.cls_eng3d < MAXWELL_A) {
      /* max size is 256k */
      dev->vab_memory = nouveau_ws_bo_new(dev->ws_dev, 1 << 17, 1 << 20,
                                          NOUVEAU_WS_BO_LOCAL |
                                          NOUVEAU_WS_BO_NO_SHARE);
      if (dev->vab_memory == NULL) {
         result = vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail_zero_page;
      }
   }

   result = nvk_queue_init(dev, &dev->queue,
                           &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_vab_memory;

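   /* Device-internal, in-memory pipeline cache */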
   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = nvk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = nvk_device_to_handle(dev);

   return VK_SUCCESS;

fail_mem_cache:
   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
fail_queue:
   nvk_queue_finish(dev, &dev->queue);
fail_vab_memory:
   if (dev->vab_memory)
      nouveau_ws_bo_destroy(dev->vab_memory);
fail_zero_page:
   nouveau_ws_bo_destroy(dev->zero_page);
fail_slm:
   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
fail_shader_heap:
   nvk_heap_finish(dev, &dev->shader_heap);
fail_samplers:
   nvk_descriptor_table_finish(dev, &dev->samplers);
fail_images:
   nvk_descriptor_table_finish(dev, &dev->images);
fail_upload:
   nvk_upload_queue_finish(dev, &dev->upload);
fail_ws_dev:
   nouveau_ws_device_destroy(dev->ws_dev);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);

   if (!dev)
      return;

   nvk_device_finish_meta(dev);

   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
   nvk_queue_finish(dev, &dev->queue);
   if (dev->vab_memory)
      nouveau_ws_bo_destroy(dev->vab_memory);
   nouveau_ws_bo_destroy(dev->zero_page);
   vk_device_finish(&dev->vk);

   /* Idle the upload queue before we tear down heaps */
   nvk_upload_queue_sync(dev, &dev->upload);

   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
   nvk_heap_finish(dev, &dev->shader_heap);
   nvk_descriptor_table_finish(dev, &dev->samplers);
   nvk_descriptor_table_finish(dev, &dev->images);
   nvk_upload_queue_finish(dev, &dev->upload);
   nouveau_ws_device_destroy(dev->ws_dev);
   vk_free(&dev->vk.alloc, dev);
}

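/* Ensure the device-wide SLM area is large enough for shaders that need
 * bytes_per_thread bytes of local (scratch) memory per thread.
 */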
VkResult
nvk_device_ensure_slm(struct nvk_device *dev,
                      uint32_t bytes_per_thread)
{
   return nvk_slm_area_ensure(dev, &dev->slm, bytes_per_thread);
}