/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_device.h"

#include "nvk_cmd_buffer.h"
#include "nvk_entrypoints.h"
#include "nvk_instance.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"
#include "nvkmd/nvkmd.h"

#include "vk_pipeline_cache.h"
#include "vulkan/wsi/wsi_common.h"

#include "cl9097.h"
#include "clb097.h"
#include "clc397.h"

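/* Device-wide shader local memory (SLM) scratch area.  The backing buffer
 * is allocated lazily by nvk_slm_area_ensure() and only ever grows; callers
 * pick up a reference to the current buffer via
 * nvk_slm_area_get_mem_ref().
 */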
static void
nvk_slm_area_init(struct nvk_slm_area *area)
{
   memset(area, 0, sizeof(*area));
   simple_mtx_init(&area->mutex, mtx_plain);
}

static void
nvk_slm_area_finish(struct nvk_slm_area *area)
{
   simple_mtx_destroy(&area->mutex);
   if (area->mem)
      nvkmd_mem_unref(area->mem);
}

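/* Return a new reference to the current SLM buffer (or NULL if none has
 * been allocated yet) along with its per-warp and per-TPC sizes, read
 * atomically under the area mutex.  The caller is responsible for dropping
 * the reference with nvkmd_mem_unref().
 */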
struct nvkmd_mem *
nvk_slm_area_get_mem_ref(struct nvk_slm_area *area,
                         uint32_t *bytes_per_warp_out,
                         uint32_t *bytes_per_tpc_out)
{
   simple_mtx_lock(&area->mutex);
   struct nvkmd_mem *mem = area->mem;
   if (mem)
      nvkmd_mem_ref(mem);
   *bytes_per_warp_out = area->bytes_per_warp;
   *bytes_per_tpc_out = area->bytes_per_tpc;
   simple_mtx_unlock(&area->mutex);

   return mem;
}

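/* Grow the SLM area, if needed, so that it can hold slm_bytes_per_lane
 * bytes of shader local memory per lane and crs_bytes_per_warp bytes of
 * CRS stack per warp.  The area never shrinks, and concurrent callers are
 * safe: whichever allocation wins the race is kept and the other is freed.
 */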
static VkResult
nvk_slm_area_ensure(struct nvk_device *dev,
                    struct nvk_slm_area *area,
                    uint32_t slm_bytes_per_lane,
                    uint32_t crs_bytes_per_warp)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   assert(slm_bytes_per_lane < (1 << 24));
   assert(crs_bytes_per_warp <= (1 << 20));
   uint64_t bytes_per_warp = slm_bytes_per_lane * 32 + crs_bytes_per_warp;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
    */
   bytes_per_warp = align64(bytes_per_warp, 0x200);

   uint64_t bytes_per_mp = bytes_per_warp * pdev->info.max_warps_per_mp;
   uint64_t bytes_per_tpc = bytes_per_mp * pdev->info.mp_per_tpc;

   /* The hardware seems to require this alignment for
    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
    */
   bytes_per_tpc = align64(bytes_per_tpc, 0x8000);
   /* nvk_slm_area::bytes_per_tpc only ever increases so we can check this
    * outside the lock and exit early in the common case.  We only need to
    * take the lock if we're actually going to resize.
    *
    * Also, we only need to compare bytes_per_tpc and not bytes_per_warp
    * because one is an integer multiple of the other.
    */
   if (likely(bytes_per_tpc <= area->bytes_per_tpc))
      return VK_SUCCESS;

   uint64_t size = bytes_per_tpc * pdev->info.tpc_count;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_D_SIZE_LOWER.
    */
   size = align64(size, 0x20000);

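   /* As a rough worked example (hypothetical numbers, not a real SKU):
    * with 1 KiB of SLM per lane, no CRS, 48 warps per MP, 2 MPs per TPC,
    * and 40 TPCs, this comes to 32 KiB per warp, 3 MiB per TPC, and a
    * 120 MiB allocation for the whole GPU.
    */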
   struct nvkmd_mem *mem;
   result = nvkmd_dev_alloc_mem(dev->nvkmd, &dev->vk.base, size, 0,
                                NVKMD_MEM_LOCAL, &mem);
   if (result != VK_SUCCESS)
      return result;

   struct nvkmd_mem *unref_mem;
   simple_mtx_lock(&area->mutex);
   if (bytes_per_tpc <= area->bytes_per_tpc) {
      /* We lost the race, throw away our BO */
      assert(area->bytes_per_warp == bytes_per_warp);
      unref_mem = mem;
   } else {
      unref_mem = area->mem;
      area->mem = mem;
      area->bytes_per_warp = bytes_per_warp;
      area->bytes_per_tpc = bytes_per_tpc;
   }
   simple_mtx_unlock(&area->mutex);

   if (unref_mem)
      nvkmd_mem_unref(unref_mem);

   return VK_SUCCESS;
}

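/* vk_device::get_timestamp hook: returns the raw GPU timestamp for this
 * device (used by the common Vulkan runtime, e.g. for calibrated
 * timestamps).
 */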
static VkResult
nvk_device_get_timestamp(struct vk_device *vk_dev, uint64_t *timestamp)
{
   struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
   *timestamp = nvkmd_dev_get_gpu_timestamp(dev->nvkmd);
   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateDevice(VkPhysicalDevice physicalDevice,
                 const VkDeviceCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator,
                 VkDevice *pDevice)
{
   VK_FROM_HANDLE(nvk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct nvk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator,
                    sizeof(*dev), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &nvk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table,
                           pCreateInfo, pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &nvk_device_shader_ops;

   result = nvkmd_pdev_create_dev(pdev->nvkmd, &pdev->vk.base, &dev->nvkmd);
   if (result != VK_SUCCESS)
      goto fail_init;

   vk_device_set_drm_fd(&dev->vk, nvkmd_dev_get_drm_fd(dev->nvkmd));
   dev->vk.command_buffer_ops = &nvk_cmd_buffer_ops;

   dev->vk.get_timestamp = nvk_device_get_timestamp;

   result = nvk_upload_queue_init(dev, &dev->upload);
   if (result != VK_SUCCESS)
      goto fail_nvkmd;

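   /* Allocate a single 4 KiB page of zeros up front.  Its GPU address is
    * used below as the backing address for the null image descriptor.
    */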
   result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &pdev->vk.base,
                                       0x1000, 0, NVKMD_MEM_LOCAL,
                                       NVKMD_MEM_MAP_WR, &dev->zero_page);
   if (result != VK_SUCCESS)
      goto fail_upload;

   memset(dev->zero_page->map, 0, 0x1000);
   nvkmd_mem_unmap(dev->zero_page, 0);

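   /* Device-global descriptor tables: image (TIC) descriptors here and
    * sampler (TSC) descriptors below.
    */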
   result = nvk_descriptor_table_init(dev, &dev->images,
                                      8 * 4 /* tic entry size */,
                                      1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_zero_page;

   /* Reserve the descriptor at offset 0 to be the null descriptor */
   uint32_t null_tic[8] = { 0, };
   nil_fill_null_tic(&pdev->info, dev->zero_page->va->addr, &null_tic);

   ASSERTED uint32_t null_image_index;
   result = nvk_descriptor_table_add(dev, &dev->images,
                                     null_tic, sizeof(null_tic),
                                     &null_image_index);
   assert(result == VK_SUCCESS);
   assert(null_image_index == 0);

   result = nvk_descriptor_table_init(dev, &dev->samplers,
                                      8 * 4 /* tsc entry size */,
                                      4096, 4096);
   if (result != VK_SUCCESS)
      goto fail_images;

   if (dev->vk.enabled_features.descriptorBuffer ||
       nvk_use_edb_buffer_views(pdev)) {
      result = nvk_edb_bview_cache_init(dev, &dev->edb_bview_cache);
      if (result != VK_SUCCESS)
         goto fail_samplers;
   }

   /* If we have a full BAR, go ahead and do shader uploads on the CPU.
    * Otherwise, we fall back to doing shader uploads via the upload queue.
    *
    * Also, the I-cache pre-fetches, and NVIDIA has informed us that
    * over-allocating shader BOs by 2K is sufficient.
    */
   enum nvkmd_mem_map_flags shader_map_flags = 0;
   if (pdev->info.bar_size_B >= pdev->info.vram_size_B)
      shader_map_flags = NVKMD_MEM_MAP_WR;
   result = nvk_heap_init(dev, &dev->shader_heap,
                          NVKMD_MEM_LOCAL, shader_map_flags,
                          2048 /* overalloc */,
                          pdev->info.cls_eng3d < VOLTA_A);
   if (result != VK_SUCCESS)
      goto fail_edb_bview_cache;

   result = nvk_heap_init(dev, &dev->event_heap,
                          NVKMD_MEM_LOCAL, NVKMD_MEM_MAP_WR,
                          0 /* overalloc */, false /* contiguous */);
   if (result != VK_SUCCESS)
      goto fail_shader_heap;

   nvk_slm_area_init(&dev->slm);

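   /* Only Fermi and Kepler class hardware needs a driver-allocated VAB;
    * on Maxwell and later this allocation is skipped entirely.
    */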
   if (pdev->info.cls_eng3d >= FERMI_A &&
       pdev->info.cls_eng3d < MAXWELL_A) {
      /* max size is 256k */
      result = nvkmd_dev_alloc_mem(dev->nvkmd, &pdev->vk.base,
                                   1 << 17, 1 << 20, NVKMD_MEM_LOCAL,
                                   &dev->vab_memory);
      if (result != VK_SUCCESS)
         goto fail_slm;
   }

   result = nvk_queue_init(dev, &dev->queue,
                           &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_vab_memory;

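   /* Device-internal pipeline cache (dev->vk.mem_cache).  With weak_ref
    * set, the cache only holds weak references to its objects, so it does
    * not keep otherwise-unreferenced objects alive.
    */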
   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->vk.mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->vk.mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = nvk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = nvk_device_to_handle(dev);

   return VK_SUCCESS;

fail_mem_cache:
   vk_pipeline_cache_destroy(dev->vk.mem_cache, NULL);
fail_queue:
   nvk_queue_finish(dev, &dev->queue);
fail_vab_memory:
   if (dev->vab_memory)
      nvkmd_mem_unref(dev->vab_memory);
fail_slm:
   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
fail_shader_heap:
   nvk_heap_finish(dev, &dev->shader_heap);
fail_edb_bview_cache:
   nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
fail_samplers:
   nvk_descriptor_table_finish(dev, &dev->samplers);
fail_images:
   nvk_descriptor_table_finish(dev, &dev->images);
fail_zero_page:
   nvkmd_mem_unref(dev->zero_page);
fail_upload:
   nvk_upload_queue_finish(dev, &dev->upload);
fail_nvkmd:
   nvkmd_dev_destroy(dev->nvkmd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);

   if (!dev)
      return;

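   /* dev->copy_queries is an internal shader that is destroyed by hand
    * here, if it was ever created, before the rest of device teardown.
    */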
   if (dev->copy_queries)
      vk_shader_destroy(&dev->vk, &dev->copy_queries->vk, &dev->vk.alloc);

   nvk_device_finish_meta(dev);

   vk_pipeline_cache_destroy(dev->vk.mem_cache, NULL);
   nvk_queue_finish(dev, &dev->queue);
   if (dev->vab_memory)
      nvkmd_mem_unref(dev->vab_memory);
   vk_device_finish(&dev->vk);

   /* Idle the upload queue before we tear down heaps */
   nvk_upload_queue_sync(dev, &dev->upload);

   nvk_slm_area_finish(&dev->slm);
   nvk_heap_finish(dev, &dev->event_heap);
   nvk_heap_finish(dev, &dev->shader_heap);
   nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
   nvk_descriptor_table_finish(dev, &dev->samplers);
   nvk_descriptor_table_finish(dev, &dev->images);
   nvkmd_mem_unref(dev->zero_page);
   nvk_upload_queue_finish(dev, &dev->upload);
   nvkmd_dev_destroy(dev->nvkmd);
   vk_free(&dev->vk.alloc, dev);
}

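/* Ensure the device-wide SLM area can satisfy the given per-lane SLM and
 * per-warp CRS requirements; thin wrapper around nvk_slm_area_ensure().
 */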
VkResult
nvk_device_ensure_slm(struct nvk_device *dev,
                      uint32_t slm_bytes_per_lane,
                      uint32_t crs_bytes_per_warp)
{
   return nvk_slm_area_ensure(dev, &dev->slm,
                              slm_bytes_per_lane,
                              crs_bytes_per_warp);
}