/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_device.h"

#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_scratch.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_table.h"
#include "hk_entrypoints.h"
#include "hk_instance.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "asahi/genxml/agx_pack.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/libagx/geometry.h"
#include "util/hash_table.h"
#include "util/ralloc.h"
#include "util/simple_mtx.h"
#include "vulkan/vulkan_core.h"
#include "vulkan/wsi/wsi_common.h"
#include "vk_cmd_enqueue_entrypoints.h"
#include "vk_common_entrypoints.h"
#include "vk_debug_utils.h"
#include "vk_device.h"
#include "vk_pipeline_cache.h"

#include <fcntl.h>
#include <xf86drm.h>

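/* Performance/debug toggles, parsed from the HK_PERFTEST environment variable
 * in hk_CreateDevice and queried via HK_PERF().
 */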
/* clang-format off */
static const struct debug_named_value hk_perf_options[] = {
   {"notess",    HK_PERF_NOTESS,    "Skip draws with tessellation"},
   {"noborder",  HK_PERF_NOBORDER,  "Disable custom border colour emulation"},
   {"nobarrier", HK_PERF_NOBARRIER, "Ignore pipeline barriers"},
   {"batch",     HK_PERF_BATCH,     "Batch submissions"},
   {"norobust",  HK_PERF_NOROBUST,  "Disable robustness"},
   DEBUG_NAMED_VALUE_END
};
/* clang-format on */

/*
 * We preupload some constants so we can cheaply reference later without extra
 * allocation and copying.
 *
 * TODO: This is small, don't waste a whole BO.
 */
static VkResult
hk_upload_rodata(struct hk_device *dev)
{
   dev->rodata.bo =
      agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, 0, "Read only data");

   if (!dev->rodata.bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint8_t *map = agx_bo_map(dev->rodata.bo);
   uint32_t offs = 0;

   offs = align(offs, 8);
   agx_pack(&dev->dev.txf_sampler, USC_SAMPLER, cfg) {
      cfg.start = 0;
      cfg.count = 1;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   agx_pack_txf_sampler((struct agx_sampler_packed *)(map + offs));
   offs += AGX_SAMPLER_LENGTH;

   /* The image heap is allocated on the device prior to the rodata. The heap
    * lives as long as the device does and has a stable address (requiring
    * sparse binding to grow dynamically). That means its address is effectively
    * rodata and can be uploaded now. agx_usc_uniform requires an indirection to
    * push the heap address, so this takes care of that indirection up front to
    * cut an alloc/upload at draw time.
    */
   offs = align(offs, sizeof(uint64_t));
   agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) {
      cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM;
      cfg.size_halfs = 4;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   uint64_t *image_heap_ptr = (void *)map + offs;
   *image_heap_ptr = dev->images.bo->va->addr;
   offs += sizeof(uint64_t);

   /* The geometry state buffer isn't strictly readonly data, but we only have a
    * single instance of it device-wide and -- after initializing at heap
    * allocate time -- it is read-only from the CPU perspective. The GPU uses it
    * for scratch, but is required to reset it after use to ensure resubmitting
    * the same command buffer works.
    *
    * So, we allocate it here for convenience.
    */
   offs = align(offs, sizeof(uint64_t));
   dev->rodata.geometry_state = dev->rodata.bo->va->addr + offs;
   offs += sizeof(struct agx_geometry_state);

   /* For null readonly buffers, we need to allocate 16 bytes of zeroes for
    * robustness2 semantics on read.
    */
   offs = align(offs, 16);
   dev->rodata.zero_sink = dev->rodata.bo->va->addr + offs;
   memset(map + offs, 0, 16);
   offs += 16;

   /* For null storage descriptors, we need to reserve 16 bytes to catch writes.
    * No particular content is required; we cannot get robustness2 semantics
    * without more work.
    */
   offs = align(offs, 16);
   dev->rodata.null_sink = dev->rodata.bo->va->addr + offs;
   offs += 16;

   return VK_SUCCESS;
}

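/* Internal shaders (prolog/epilog parts and built-in kernels) are cached in a
 * hash table keyed by the builder function plus a variable-length key blob, so
 * hashing and comparison must cover the flexible key_size bytes as well.
 */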
static uint32_t
internal_key_hash(const void *key_)
{
   const struct hk_internal_key *key = key_;

   return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size);
}

static bool
internal_key_equal(const void *a_, const void *b_)
{
   const struct hk_internal_key *a = a_;
   const struct hk_internal_key *b = b_;

   return a->builder == b->builder && a->key_size == b->key_size &&
          memcmp(a->key, b->key, a->key_size) == 0;
}

static VkResult
hk_init_internal_shaders(struct hk_internal_shaders *s)
{
   s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal);
   if (!s->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   simple_mtx_init(&s->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_internal_shaders(struct hk_device *dev,
                            struct hk_internal_shaders *s, bool part)
{
   hash_table_foreach(s->ht, ent) {
      if (part) {
         struct agx_shader_part *part = ent->data;
         free(part->binary);

         /* The agx_shader_part itself is ralloc'd against the hash table so
          * will be freed.
          */
      } else {
         struct hk_api_shader *obj = ent->data;
         hk_api_shader_destroy(&dev->vk, &obj->vk, NULL);
      }
   }

   _mesa_hash_table_destroy(s->ht, NULL);
   simple_mtx_destroy(&s->lock);
}

DERIVE_HASH_TABLE(agx_sampler_packed);

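/* Sampler heap: hardware sampler descriptors are deduplicated through a hash
 * table and reference counted, with the descriptor words themselves living in
 * a fixed-capacity descriptor table indexed by the returned hk_rc_sampler.
 */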
static VkResult
hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   h->ht = agx_sampler_packed_table_create(NULL);
   if (!h->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   VkResult result =
      hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024);

   if (result != VK_SUCCESS) {
      ralloc_free(h->ht);
      return result;
   }

   simple_mtx_init(&h->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   hk_descriptor_table_finish(dev, &h->table);
   ralloc_free(h->ht);
   simple_mtx_destroy(&h->lock);
}

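/* Called with the heap lock held. Returns an existing entry with its refcount
 * bumped if an identical descriptor is already in the heap, otherwise
 * allocates a new table slot and hash table entry.
 */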
static VkResult
hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                           struct agx_sampler_packed desc,
                           struct hk_rc_sampler **out)
{
   struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc);
   if (ent != NULL) {
      *out = ent->data;

      assert((*out)->refcount != 0);
      (*out)->refcount++;

      return VK_SUCCESS;
   }

   struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler);
   if (!rc)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t index;
   VkResult result =
      hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index);
   if (result != VK_SUCCESS) {
      ralloc_free(rc);
      return result;
   }

   *rc = (struct hk_rc_sampler){
      .key = desc,
      .refcount = 1,
      .index = index,
   };

   _mesa_hash_table_insert(h->ht, &rc->key, rc);
   *out = rc;

   return VK_SUCCESS;
}

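/* Thread-safe entry point. Each successful add must be balanced by a
 * hk_sampler_heap_remove so the underlying table slot can be recycled.
 */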
VkResult
hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc,
                    struct hk_rc_sampler **out)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out);
   simple_mtx_unlock(&h->lock);

   return result;
}

static void
hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                              struct hk_rc_sampler *rc)
{
   assert(rc->refcount != 0);
   rc->refcount--;

   if (rc->refcount == 0) {
      hk_descriptor_table_remove(dev, &h->table, rc->index);
      _mesa_hash_table_remove_key(h->ht, &rc->key);
      ralloc_free(rc);
   }
}

void
hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   hk_sampler_heap_remove_locked(dev, h, rc);
   simple_mtx_unlock(&h->lock);
}

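/* Device status callback: reports errors recorded through the device's printf
 * buffer, which vk_check_printf_status translates into a Vulkan result.
 */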
static VkResult
hk_check_status(struct vk_device *device)
{
   struct hk_device *dev = container_of(device, struct hk_device, vk);
   return vk_check_printf_status(&dev->vk, &dev->dev.printf);
}

static VkResult
hk_get_timestamp(struct vk_device *device, uint64_t *timestamp)
{
   struct hk_device *dev = container_of(device, struct hk_device, vk);
   unreachable("todo");
   // *timestamp = agx_get_gpu_timestamp(dev);
   return VK_SUCCESS;
}

/*
 * To implement nullDescriptor, the descriptor set code will reference
 * preuploaded null descriptors at fixed offsets in the image heap. Here we
 * upload those descriptors, initializing the image heap.
 */
static void
hk_upload_null_descriptors(struct hk_device *dev)
{
   struct agx_texture_packed null_tex;
   struct agx_pbe_packed null_pbe;
   uint32_t offset_tex, offset_pbe;

   agx_set_null_texture(&null_tex, dev->rodata.null_sink);
   agx_set_null_pbe(&null_pbe, dev->rodata.null_sink);

   hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex),
                           &offset_tex);

   hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe),
                           &offset_pbe);

   assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static");
   assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static");
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_CreateDevice(VkPhysicalDevice physicalDevice,
                const VkDeviceCreateInfo *pCreateInfo,
                const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
{
   VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct hk_device *dev;
   struct hk_instance *instance = (struct hk_instance *)pdev->vk.instance;

   dev = vk_zalloc2(&instance->vk.alloc, pAllocator, sizeof(*dev), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;

   /* For secondary command buffer support, overwrite any command entrypoints
    * in the main device-level dispatch table with
    * vk_cmd_enqueue_unless_primary_Cmd*.
    */
   vk_device_dispatch_table_from_entrypoints(
      &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true);

   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &hk_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   /* Populate primary cmd_dispatch table */
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &hk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &wsi_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(
      &dev->cmd_dispatch, &vk_common_device_entrypoints, false);

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo,
                           pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &hk_device_shader_ops;
   dev->vk.command_dispatch_table = &dev->cmd_dispatch;

   drmDevicePtr drm_device = NULL;
   int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device);
   if (ret != 0) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to get DRM device: %m");
      goto fail_init;
   }

   const char *path = drm_device->nodes[DRM_NODE_RENDER];
   dev->dev.fd = open(path, O_RDWR | O_CLOEXEC);
   if (dev->dev.fd < 0) {
      drmFreeDevice(&drm_device);
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "failed to open device %s", path);
      goto fail_init;
   }

   dev->perftest = debug_get_flags_option("HK_PERFTEST", hk_perf_options, 0);

   if (instance->no_border) {
      dev->perftest |= HK_PERF_NOBORDER;
   }

   if (HK_PERF(dev, NOROBUST)) {
      dev->vk.enabled_features.robustBufferAccess = false;
      dev->vk.enabled_features.robustBufferAccess2 = false;
      dev->vk.enabled_features.robustImageAccess = false;
      dev->vk.enabled_features.robustImageAccess2 = false;
      dev->vk.enabled_features.pipelineRobustness = false;
   }

   bool succ = agx_open_device(NULL, &dev->dev);
   drmFreeDevice(&drm_device);
   if (!succ) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to open AGX device: %m");
      goto fail_fd;
   }

   vk_device_set_drm_fd(&dev->vk, dev->dev.fd);
   dev->vk.command_buffer_ops = &hk_cmd_buffer_ops;
   dev->vk.check_status = hk_check_status;
   dev->vk.get_timestamp = hk_get_timestamp;

   result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH,
                                     1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_dev;

   result = hk_init_sampler_heap(dev, &dev->samplers);
   if (result != VK_SUCCESS)
      goto fail_images;

   result = hk_descriptor_table_init(
      dev, &dev->occlusion_queries, sizeof(uint64_t), AGX_MAX_OCCLUSION_QUERIES,
      AGX_MAX_OCCLUSION_QUERIES);
   if (result != VK_SUCCESS)
      goto fail_samplers;

   result = hk_upload_rodata(dev);
   if (result != VK_SUCCESS)
      goto fail_queries;

   /* Depends on rodata */
   hk_upload_null_descriptors(dev);

   /* XXX: error handling, and should this even go on the device? */
   agx_bg_eot_init(&dev->bg_eot, &dev->dev);
   if (!dev->bg_eot.ht) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_rodata;
   }

   result = hk_init_internal_shaders(&dev->prolog_epilog);
   if (result != VK_SUCCESS)
      goto fail_bg_eot;

   result = hk_init_internal_shaders(&dev->kernels);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders;

   result =
      hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders_2;

   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = hk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = hk_device_to_handle(dev);

   simple_mtx_init(&dev->scratch.lock, mtx_plain);
   agx_scratch_init(&dev->dev, &dev->scratch.vs);
   agx_scratch_init(&dev->dev, &dev->scratch.fs);
   agx_scratch_init(&dev->dev, &dev->scratch.cs);

   u_rwlock_init(&dev->external_bos.lock);
   util_dynarray_init(&dev->external_bos.counts, NULL);
   util_dynarray_init(&dev->external_bos.list, NULL);

   return VK_SUCCESS;

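   /* Unwind in reverse order of initialization: each failure label cleans up
    * exactly the state created before its corresponding goto.
    */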
fail_mem_cache:
   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
fail_queue:
   hk_queue_finish(dev, &dev->queue);
fail_internal_shaders_2:
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
fail_internal_shaders:
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);
fail_bg_eot:
   agx_bg_eot_cleanup(&dev->bg_eot);
fail_rodata:
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
fail_queries:
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
fail_samplers:
   hk_destroy_sampler_heap(dev, &dev->samplers);
fail_images:
   hk_descriptor_table_finish(dev, &dev->images);
fail_dev:
   agx_close_device(&dev->dev);
fail_fd:
   close(dev->dev.fd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

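/* Tear down the device in roughly the reverse order of hk_CreateDevice: meta
 * state and shader caches first, then the queue and Vulkan device, and finally
 * the heaps, rodata, and the underlying AGX/DRM device.
 */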
VKAPI_ATTR void VKAPI_CALL
hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(hk_device, dev, _device);

   if (!dev)
      return;

   util_dynarray_fini(&dev->external_bos.counts);
   util_dynarray_fini(&dev->external_bos.list);
   u_rwlock_destroy(&dev->external_bos.lock);

   hk_device_finish_meta(dev);
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);

   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
   hk_queue_finish(dev, &dev->queue);
   vk_device_finish(&dev->vk);

   agx_scratch_fini(&dev->scratch.vs);
   agx_scratch_fini(&dev->scratch.fs);
   agx_scratch_fini(&dev->scratch.cs);
   simple_mtx_destroy(&dev->scratch.lock);

   hk_destroy_sampler_heap(dev, &dev->samplers);
   hk_descriptor_table_finish(dev, &dev->images);
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
   agx_bo_unreference(&dev->dev, dev->heap);
   agx_bg_eot_cleanup(&dev->bg_eot);
   agx_close_device(&dev->dev);
   vk_free(&dev->vk.alloc, dev);
}