/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_device.h"

#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_scratch.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_table.h"
#include "hk_entrypoints.h"
#include "hk_instance.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "asahi/genxml/agx_pack.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/libagx/geometry.h"
#include "util/hash_table.h"
#include "util/ralloc.h"
#include "util/simple_mtx.h"
#include "vulkan/vulkan_core.h"
#include "vulkan/wsi/wsi_common.h"
#include "vk_cmd_enqueue_entrypoints.h"
#include "vk_common_entrypoints.h"
#include "vk_debug_utils.h"
#include "vk_device.h"
#include "vk_pipeline_cache.h"

#include <fcntl.h>
#include <xf86drm.h>

/* clang-format off */
static const struct debug_named_value hk_perf_options[] = {
   {"notess",    HK_PERF_NOTESS,   "Skip draws with tessellation"},
   {"noborder",  HK_PERF_NOBORDER, "Disable custom border colour emulation"},
   {"nobarrier", HK_PERF_NOBARRIER,"Ignore pipeline barriers"},
   {"batch",     HK_PERF_BATCH,    "Batch submissions"},
   {"norobust",  HK_PERF_NOROBUST, "Disable robustness"},
   DEBUG_NAMED_VALUE_END
};
/* clang-format on */

/*
 * We preupload some constants so we can cheaply reference them later without
 * extra allocation and copying.
 *
 * TODO: This is small, don't waste a whole BO.
 */
static VkResult
hk_upload_rodata(struct hk_device *dev)
{
   dev->rodata.bo =
      agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, 0, "Read only data");

   if (!dev->rodata.bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint8_t *map = agx_bo_map(dev->rodata.bo);
   uint32_t offs = 0;

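   /* Pack the USC sampler record for the device's txf sampler and upload the
    * sampler descriptor it points at into the rodata BO.
    */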
   offs = align(offs, 8);
   agx_pack(&dev->dev.txf_sampler, USC_SAMPLER, cfg) {
      cfg.start = 0;
      cfg.count = 1;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   agx_pack_txf_sampler((struct agx_sampler_packed *)(map + offs));
   offs += AGX_SAMPLER_LENGTH;

   /* The image heap is allocated on the device prior to the rodata. The heap
    * lives as long as the device does and has a stable address (requiring
    * sparse binding to grow dynamically). That means its address is effectively
    * rodata and can be uploaded now. agx_usc_uniform requires an indirection to
    * push the heap address, so this takes care of that indirection up front to
    * cut an alloc/upload at draw time.
    */
   offs = align(offs, sizeof(uint64_t));
   agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) {
      cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM;
      cfg.size_halfs = 4;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   uint64_t *image_heap_ptr = (void *)map + offs;
   *image_heap_ptr = dev->images.bo->va->addr;
   offs += sizeof(uint64_t);

   /* The geometry state buffer isn't strictly readonly data, but we only have a
    * single instance of it device-wide and -- after initializing at heap
    * allocate time -- it is read-only from the CPU perspective. The GPU uses it
    * for scratch, but is required to reset it after use to ensure resubmitting
    * the same command buffer works.
    *
    * So, we allocate it here for convenience.
    */
   offs = align(offs, sizeof(uint64_t));
   dev->rodata.geometry_state = dev->rodata.bo->va->addr + offs;
   offs += sizeof(struct agx_geometry_state);

   /* For null readonly buffers, we need to allocate 16 bytes of zeroes for
    * robustness2 semantics on read.
    */
   offs = align(offs, 16);
   dev->rodata.zero_sink = dev->rodata.bo->va->addr + offs;
   memset(map + offs, 0, 16);
   offs += 16;

   /* For null storage descriptors, we need to reserve 16 bytes to catch writes.
    * No particular content is required; we cannot get robustness2 semantics
    * without more work.
    */
   offs = align(offs, 16);
   dev->rodata.null_sink = dev->rodata.bo->va->addr + offs;
   offs += 16;

   return VK_SUCCESS;
}

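/*
 * Internal shaders (prolog/epilog parts and precompiled kernels) are cached in
 * hash tables keyed by the builder plus a variable-length key blob, so the
 * hash/equality helpers below must account for the trailing key data.
 */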
static uint32_t
internal_key_hash(const void *key_)
{
   const struct hk_internal_key *key = key_;

   return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size);
}

static bool
internal_key_equal(const void *a_, const void *b_)
{
   const struct hk_internal_key *a = a_;
   const struct hk_internal_key *b = b_;

   return a->builder == b->builder && a->key_size == b->key_size &&
          memcmp(a->key, b->key, a->key_size) == 0;
}

static VkResult
hk_init_internal_shaders(struct hk_internal_shaders *s)
{
   s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal);
   if (!s->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   simple_mtx_init(&s->lock, mtx_plain);
   return VK_SUCCESS;
}

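/*
 * Tear down an internal shader cache. Entries are either bare agx_shader_part
 * binaries (part = true, used for prologs/epilogs) or full hk_api_shader
 * objects (part = false), so they are freed accordingly.
 */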
static void
hk_destroy_internal_shaders(struct hk_device *dev,
                            struct hk_internal_shaders *s, bool part)
{
   hash_table_foreach(s->ht, ent) {
      if (part) {
         struct agx_shader_part *part = ent->data;
         free(part->binary);

         /* The agx_shader_part itself is ralloc'd against the hash table so
          * will be freed.
          */
      } else {
         struct hk_api_shader *obj = ent->data;
         hk_api_shader_destroy(&dev->vk, &obj->vk, NULL);
      }
   }

   _mesa_hash_table_destroy(s->ht, NULL);
   simple_mtx_destroy(&s->lock);
}

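/* Generate hash table helpers keyed on the raw bits of a packed sampler
 * descriptor, used to deduplicate samplers in the heap below.
 */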
DERIVE_HASH_TABLE(agx_sampler_packed);

static VkResult
hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   h->ht = agx_sampler_packed_table_create(NULL);
   if (!h->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   VkResult result =
      hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024);

   if (result != VK_SUCCESS) {
      ralloc_free(h->ht);
      return result;
   }

   simple_mtx_init(&h->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   hk_descriptor_table_finish(dev, &h->table);
   ralloc_free(h->ht);
   simple_mtx_destroy(&h->lock);
}

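/*
 * Add a sampler to the heap, deduplicating identical descriptors: an existing
 * entry just gains a reference, otherwise the descriptor is uploaded to the
 * table and a new refcounted entry is inserted into the hash table.
 */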
static VkResult
hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                           struct agx_sampler_packed desc,
                           struct hk_rc_sampler **out)
{
   struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc);
   if (ent != NULL) {
      *out = ent->data;

      assert((*out)->refcount != 0);
      (*out)->refcount++;

      return VK_SUCCESS;
   }

   struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler);
   if (!rc)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t index;
   VkResult result =
      hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index);
   if (result != VK_SUCCESS) {
      ralloc_free(rc);
      return result;
   }

   *rc = (struct hk_rc_sampler){
      .key = desc,
      .refcount = 1,
      .index = index,
   };

   _mesa_hash_table_insert(h->ht, &rc->key, rc);
   *out = rc;

   return VK_SUCCESS;
}

VkResult
hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc,
                    struct hk_rc_sampler **out)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out);
   simple_mtx_unlock(&h->lock);

   return result;
}

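/*
 * Drop a reference to a heap sampler. When the last reference goes away, the
 * table slot is released and the hash table entry is freed.
 */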
static void
hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                              struct hk_rc_sampler *rc)
{
   assert(rc->refcount != 0);
   rc->refcount--;

   if (rc->refcount == 0) {
      hk_descriptor_table_remove(dev, &h->table, rc->index);
      _mesa_hash_table_remove_key(h->ht, &rc->key);
      ralloc_free(rc);
   }
}

void
hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   hk_sampler_heap_remove_locked(dev, h, rc);
   simple_mtx_unlock(&h->lock);
}

static VkResult
hk_check_status(struct vk_device *device)
{
   struct hk_device *dev = container_of(device, struct hk_device, vk);
   return vk_check_printf_status(&dev->vk, &dev->dev.printf);
}

static VkResult
hk_get_timestamp(struct vk_device *device, uint64_t *timestamp)
{
   struct hk_device *dev = container_of(device, struct hk_device, vk);
   unreachable("todo");
   // *timestamp = agx_get_gpu_timestamp(dev);
   return VK_SUCCESS;
}

/*
 * To implement nullDescriptor, the descriptor set code will reference
 * preuploaded null descriptors at fixed offsets in the image heap. Here we
 * upload those descriptors, initializing the image heap.
 */
static void
hk_upload_null_descriptors(struct hk_device *dev)
{
   struct agx_texture_packed null_tex;
   struct agx_pbe_packed null_pbe;
   uint32_t offset_tex, offset_pbe;

   agx_set_null_texture(&null_tex, dev->rodata.null_sink);
   agx_set_null_pbe(&null_pbe, dev->rodata.null_sink);

   hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex),
                           &offset_tex);

   hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe),
                           &offset_pbe);

   assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static");
   assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static");
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_CreateDevice(VkPhysicalDevice physicalDevice,
                const VkDeviceCreateInfo *pCreateInfo,
                const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
{
   VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct hk_device *dev;
   struct hk_instance *instance = (struct hk_instance *)pdev->vk.instance;

   dev = vk_zalloc2(&instance->vk.alloc, pAllocator, sizeof(*dev), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;

   /* For secondary command buffer support, overwrite any command entrypoints
    * in the main device-level dispatch table with
    * vk_cmd_enqueue_unless_primary_Cmd*.
    */
   vk_device_dispatch_table_from_entrypoints(
      &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true);

   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &hk_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   /* Populate primary cmd_dispatch table */
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &hk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &wsi_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(
      &dev->cmd_dispatch, &vk_common_device_entrypoints, false);

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo,
                           pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &hk_device_shader_ops;
   dev->vk.command_dispatch_table = &dev->cmd_dispatch;

   drmDevicePtr drm_device = NULL;
   int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device);
   if (ret != 0) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to get DRM device: %m");
      goto fail_init;
   }

   const char *path = drm_device->nodes[DRM_NODE_RENDER];
   dev->dev.fd = open(path, O_RDWR | O_CLOEXEC);
   if (dev->dev.fd < 0) {
      drmFreeDevice(&drm_device);
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "failed to open device %s", path);
      goto fail_init;
   }

   dev->perftest = debug_get_flags_option("HK_PERFTEST", hk_perf_options, 0);

   if (instance->no_border) {
      dev->perftest |= HK_PERF_NOBORDER;
   }

   if (HK_PERF(dev, NOROBUST)) {
      dev->vk.enabled_features.robustBufferAccess = false;
      dev->vk.enabled_features.robustBufferAccess2 = false;
      dev->vk.enabled_features.robustImageAccess = false;
      dev->vk.enabled_features.robustImageAccess2 = false;
      dev->vk.enabled_features.pipelineRobustness = false;
   }

   bool succ = agx_open_device(NULL, &dev->dev);
   drmFreeDevice(&drm_device);
   if (!succ) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to open AGX device: %m");
      goto fail_fd;
   }

   vk_device_set_drm_fd(&dev->vk, dev->dev.fd);
   dev->vk.command_buffer_ops = &hk_cmd_buffer_ops;
   dev->vk.check_status = hk_check_status;
   dev->vk.get_timestamp = hk_get_timestamp;

   result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH,
                                     1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_dev;

   result = hk_init_sampler_heap(dev, &dev->samplers);
   if (result != VK_SUCCESS)
      goto fail_images;

   result = hk_descriptor_table_init(
      dev, &dev->occlusion_queries, sizeof(uint64_t), AGX_MAX_OCCLUSION_QUERIES,
      AGX_MAX_OCCLUSION_QUERIES);
   if (result != VK_SUCCESS)
      goto fail_samplers;

   result = hk_upload_rodata(dev);
   if (result != VK_SUCCESS)
      goto fail_queries;

   /* Depends on rodata */
   hk_upload_null_descriptors(dev);

   /* XXX: error handling, and should this even go on the device? */
   agx_bg_eot_init(&dev->bg_eot, &dev->dev);
   if (!dev->bg_eot.ht) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_rodata;
   }

   result = hk_init_internal_shaders(&dev->prolog_epilog);
   if (result != VK_SUCCESS)
      goto fail_bg_eot;

   result = hk_init_internal_shaders(&dev->kernels);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders;

   result =
      hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders_2;

   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = hk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = hk_device_to_handle(dev);

   simple_mtx_init(&dev->scratch.lock, mtx_plain);
   agx_scratch_init(&dev->dev, &dev->scratch.vs);
   agx_scratch_init(&dev->dev, &dev->scratch.fs);
   agx_scratch_init(&dev->dev, &dev->scratch.cs);

   u_rwlock_init(&dev->external_bos.lock);
   util_dynarray_init(&dev->external_bos.counts, NULL);
   util_dynarray_init(&dev->external_bos.list, NULL);

   return VK_SUCCESS;

fail_mem_cache:
   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
fail_queue:
   hk_queue_finish(dev, &dev->queue);
fail_internal_shaders_2:
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
fail_internal_shaders:
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);
fail_bg_eot:
   agx_bg_eot_cleanup(&dev->bg_eot);
fail_rodata:
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
fail_queries:
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
fail_samplers:
   hk_destroy_sampler_heap(dev, &dev->samplers);
fail_images:
   hk_descriptor_table_finish(dev, &dev->images);
fail_dev:
   agx_close_device(&dev->dev);
fail_fd:
   close(dev->dev.fd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(hk_device, dev, _device);

   if (!dev)
      return;

   util_dynarray_fini(&dev->external_bos.counts);
   util_dynarray_fini(&dev->external_bos.list);
   u_rwlock_destroy(&dev->external_bos.lock);

   hk_device_finish_meta(dev);
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);

   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
   hk_queue_finish(dev, &dev->queue);
   vk_device_finish(&dev->vk);

   agx_scratch_fini(&dev->scratch.vs);
   agx_scratch_fini(&dev->scratch.fs);
   agx_scratch_fini(&dev->scratch.cs);
   simple_mtx_destroy(&dev->scratch.lock);

   hk_destroy_sampler_heap(dev, &dev->samplers);
   hk_descriptor_table_finish(dev, &dev->images);
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
   agx_bo_unreference(&dev->dev, dev->heap);
   agx_bg_eot_cleanup(&dev->bg_eot);
   agx_close_device(&dev->dev);
   vk_free(&dev->vk.alloc, dev);
}