1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based on amdgpu winsys.
6  * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
7  * Copyright © 2015 Advanced Micro Devices, Inc.
8  *
9  * SPDX-License-Identifier: MIT
10  */
11 
12 #include <stdio.h>
13 
14 #include "radv_amdgpu_bo.h"
15 #include "radv_debug.h"
16 
17 #include <amdgpu.h>
18 #include <inttypes.h>
19 #include <pthread.h>
20 #include <unistd.h>
21 #include <xf86drm.h>
22 #include "drm-uapi/amdgpu_drm.h"
23 #include <sys/mman.h>
24 #include "ac_linux_drm.h"
25 
26 #include "util/os_drm.h"
27 #include "util/os_time.h"
28 #include "util/u_atomic.h"
29 #include "util/u_math.h"
30 #include "util/u_memory.h"
31 
32 static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo);
33 
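/* Thin wrapper around ac_drm_bo_va_op_raw() that builds the VM page flags:
 * when a real BO handle is given, the mapping is always readable/executable,
 * writeable unless RADEON_FLAG_READ_ONLY is set, and uncached (MTYPE_UC) on
 * GFX9+ when RADEON_FLAG_VA_UNCACHED is requested. When bo_handle is 0, the
 * caller-provided internal_flags (e.g. AMDGPU_VM_PAGE_PRT) are used instead.
 * The size is aligned up to the CPU page size before performing the VA op. */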
34 static int
35 radv_amdgpu_bo_va_op(struct radv_amdgpu_winsys *ws, uint32_t bo_handle, uint64_t offset, uint64_t size, uint64_t addr,
36                      uint32_t bo_flags, uint64_t internal_flags, uint32_t ops)
37 {
38    uint64_t flags = internal_flags;
39    if (bo_handle) {
40       flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_EXECUTABLE;
41 
42       if ((bo_flags & RADEON_FLAG_VA_UNCACHED) && ws->info.gfx_level >= GFX9)
43          flags |= AMDGPU_VM_MTYPE_UC;
44 
45       if (!(bo_flags & RADEON_FLAG_READ_ONLY))
46          flags |= AMDGPU_VM_PAGE_WRITEABLE;
47    }
48 
49    size = align64(size, getpagesize());
50 
51    return ac_drm_bo_va_op_raw(ws->dev, bo_handle, offset, size, addr, flags, ops);
52 }
53 
54 static int
55 bo_comparator(const void *ap, const void *bp)
56 {
57    struct radv_amdgpu_bo *a = *(struct radv_amdgpu_bo *const *)ap;
58    struct radv_amdgpu_bo *b = *(struct radv_amdgpu_bo *const *)bp;
59    return (a > b) ? 1 : (a < b) ? -1 : 0;
60 }
61 
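/* Rebuild the flat, deduplicated list of backing BOs of a virtual (sparse) BO
 * from its bind ranges: collect every non-NULL range->bo, sort the array with
 * bo_comparator() and drop adjacent duplicates, so bo->bos[] ends up holding
 * each backing BO exactly once. Takes bo->lock for writing while updating
 * bos[]/bo_count. */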
62 static VkResult
63 radv_amdgpu_winsys_rebuild_bo_list(struct radv_amdgpu_winsys_bo *bo)
64 {
65    u_rwlock_wrlock(&bo->lock);
66 
67    if (bo->bo_capacity < bo->range_count) {
68       uint32_t new_count = MAX2(bo->bo_capacity * 2, bo->range_count);
69       struct radv_amdgpu_winsys_bo **bos = realloc(bo->bos, new_count * sizeof(struct radv_amdgpu_winsys_bo *));
70       if (!bos) {
71          u_rwlock_wrunlock(&bo->lock);
72          return VK_ERROR_OUT_OF_HOST_MEMORY;
73       }
74       bo->bos = bos;
75       bo->bo_capacity = new_count;
76    }
77 
78    uint32_t temp_bo_count = 0;
79    for (uint32_t i = 0; i < bo->range_count; ++i)
80       if (bo->ranges[i].bo)
81          bo->bos[temp_bo_count++] = bo->ranges[i].bo;
82 
83    qsort(bo->bos, temp_bo_count, sizeof(struct radv_amdgpu_winsys_bo *), &bo_comparator);
84 
85    if (!temp_bo_count) {
86       bo->bo_count = 0;
87    } else {
88       uint32_t final_bo_count = 1;
89       for (uint32_t i = 1; i < temp_bo_count; ++i)
90          if (bo->bos[i] != bo->bos[i - 1])
91             bo->bos[final_bo_count++] = bo->bos[i];
92 
93       bo->bo_count = final_bo_count;
94    }
95 
96    u_rwlock_wrunlock(&bo->lock);
97    return VK_SUCCESS;
98 }
99 
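/* Append a "virtual mapping" entry to ws->log_bo_list (only when
 * ws->debug_log_bos is set) so that radv_amdgpu_dump_bo_log() can later show
 * which VA range was mapped to which backing BO offset, and when. */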
100 static void
101 radv_amdgpu_log_va_op(struct radv_amdgpu_winsys *ws,
102                       struct radv_amdgpu_winsys_bo *bo, uint64_t offset, uint64_t size,
103                       uint64_t virtual_va)
104 {
105    struct radv_amdgpu_winsys_bo_log *bo_log = NULL;
106 
107    if (!ws->debug_log_bos)
108       return;
109 
110    bo_log = calloc(1, sizeof(*bo_log));
111    if (!bo_log)
112       return;
113 
114    bo_log->va = virtual_va;
115    bo_log->size = size;
116    bo_log->timestamp = os_time_get_nano();
117    bo_log->virtual_mapping = 1;
118    bo_log->mapped_va = bo ? (bo->base.va + offset) : 0;
119 
120    u_rwlock_wrlock(&ws->log_bo_list_lock);
121    list_addtail(&bo_log->list, &ws->log_bo_list);
122    u_rwlock_wrunlock(&ws->log_bo_list_lock);
123 }
124 
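/* Bind (or unbind, when _bo is NULL) a range of a virtual/sparse BO. The PTEs
 * are updated with a single AMDGPU_VA_OP_REPLACE, and parent->ranges[] is then
 * updated so that it stays a sorted, non-overlapping description of the whole
 * VA range: the affected ranges are merged, split or removed as needed and the
 * flat list of backing BOs is rebuilt at the end. */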
125 static VkResult
126 radv_amdgpu_winsys_bo_virtual_bind(struct radeon_winsys *_ws, struct radeon_winsys_bo *_parent, uint64_t offset,
127                                    uint64_t size, struct radeon_winsys_bo *_bo, uint64_t bo_offset)
128 {
129    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
130    struct radv_amdgpu_winsys_bo *parent = (struct radv_amdgpu_winsys_bo *)_parent;
131    struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo *)_bo;
132    int range_count_delta, new_idx;
133    int first = 0, last;
134    struct radv_amdgpu_map_range new_first, new_last;
135    VkResult result;
136    int r;
137 
138    assert(parent->is_virtual);
139    assert(!bo || !bo->is_virtual);
140 
141    /* When the BO is NULL, AMDGPU will reset the PTE VA range to the initial state. Otherwise, it
142     * will first unmap all existing VA mappings that overlap the requested range and then map.
143     */
144    if (bo) {
145       r =
146          radv_amdgpu_bo_va_op(ws, bo->bo_handle, bo_offset, size, parent->base.va + offset, 0, 0, AMDGPU_VA_OP_REPLACE);
147       radv_amdgpu_log_va_op(ws, bo, bo_offset, size, parent->base.va + offset);
148    } else {
149       r = radv_amdgpu_bo_va_op(ws, 0, 0, size, parent->base.va + offset, 0, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
150       radv_amdgpu_log_va_op(ws, NULL, 0, size, parent->base.va + offset);
151    }
152 
153    if (r) {
154       fprintf(stderr, "radv/amdgpu: Failed to replace a PRT VA region (%d).\n", r);
155       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
156    }
157 
158    /* Do not add the BO to the virtual BO list if it's already in the global list to avoid dangling
159     * BO references because it might have been destroyed without being previously unbound. Resetting
160     * it to NULL clears the old BO ranges if present.
161     *
162     * This is going to be clarified in the Vulkan spec:
163     * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3125
164     *
165     * The issue still exists for non-global BOs, but it will be addressed later, once we are 100% sure
166     * it's RADV's fault (mostly because the solution looks more complicated).
167     */
168    if (bo && radv_buffer_is_resident(&bo->base)) {
169       bo = NULL;
170       bo_offset = 0;
171    }
172 
173    /* We have at most 2 new ranges (1 by the bind, and another one by splitting a range that
174     * contains the newly bound range). */
175    if (parent->range_capacity - parent->range_count < 2) {
176       uint32_t range_capacity = parent->range_capacity + 2;
177       struct radv_amdgpu_map_range *ranges =
178          realloc(parent->ranges, range_capacity * sizeof(struct radv_amdgpu_map_range));
179       if (!ranges)
180          return VK_ERROR_OUT_OF_HOST_MEMORY;
181       parent->ranges = ranges;
182       parent->range_capacity = range_capacity;
183    }
184 
185    /*
186     * [first, last] is exactly the range of ranges that either overlap the
187     * new parent, or are adjacent to it. This corresponds to the bind ranges
188     * that may change.
189     */
190    while (first + 1 < parent->range_count && parent->ranges[first].offset + parent->ranges[first].size < offset)
191       ++first;
192 
193    last = first;
194    while (last + 1 < parent->range_count && parent->ranges[last + 1].offset <= offset + size)
195       ++last;
196 
197    /* Whether the first or last range is going to be totally removed or just
198     * resized/left alone. Note that in the case of first == last, we will split
199     * it into a part before and after the new range. The remove flag then
200     * indicates that the corresponding split part should not be created. */
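   /* For example, if ranges [0,64) and [64,128) are present and a different BO
    * is bound over [16,48), then first == last == 0, remove_first and
    * remove_last stay false, the old [0,64) range is split into [0,16) and
    * [48,64), and the new [16,48) range is inserted in between
    * (range_count_delta == 2). */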
201    bool remove_first = parent->ranges[first].offset == offset;
202    bool remove_last = parent->ranges[last].offset + parent->ranges[last].size == offset + size;
203 
204    assert(parent->ranges[first].offset <= offset);
205    assert(parent->ranges[last].offset + parent->ranges[last].size >= offset + size);
206 
207    /* Try to merge the new range with the first range. */
208    if (parent->ranges[first].bo == bo &&
209        (!bo || offset - bo_offset == parent->ranges[first].offset - parent->ranges[first].bo_offset)) {
210       size += offset - parent->ranges[first].offset;
211       offset = parent->ranges[first].offset;
212       bo_offset = parent->ranges[first].bo_offset;
213       remove_first = true;
214    }
215 
216    /* Try to merge the new range with the last range. */
217    if (parent->ranges[last].bo == bo &&
218        (!bo || offset - bo_offset == parent->ranges[last].offset - parent->ranges[last].bo_offset)) {
219       size = parent->ranges[last].offset + parent->ranges[last].size - offset;
220       remove_last = true;
221    }
222 
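   /* One new range is inserted for the bind itself, the (last - first + 1) old
    * ranges in [first, last] are consumed, and the leading/trailing split parts
    * are kept only when remove_first/remove_last are false. */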
223    range_count_delta = 1 - (last - first + 1) + !remove_first + !remove_last;
224    new_idx = first + !remove_first;
225 
226    /* If the first/last ranges are not left alone, we unmap them and optionally map
227     * them again after modifications. Note that this implicitly can do the splitting
228     * if first == last. */
229    new_first = parent->ranges[first];
230    new_last = parent->ranges[last];
231 
232    if (parent->ranges[first].offset + parent->ranges[first].size > offset || remove_first) {
233       if (!remove_first) {
234          new_first.size = offset - new_first.offset;
235       }
236    }
237 
238    if (parent->ranges[last].offset < offset + size || remove_last) {
239       if (!remove_last) {
240          new_last.size -= offset + size - new_last.offset;
241          new_last.bo_offset += (offset + size - new_last.offset);
242          new_last.offset = offset + size;
243       }
244    }
245 
246    /* Moves the range list after last to account for the changed number of ranges. */
247    memmove(parent->ranges + last + 1 + range_count_delta, parent->ranges + last + 1,
248            sizeof(struct radv_amdgpu_map_range) * (parent->range_count - last - 1));
249 
250    if (!remove_first)
251       parent->ranges[first] = new_first;
252 
253    if (!remove_last)
254       parent->ranges[new_idx + 1] = new_last;
255 
256    /* Actually set up the new range. */
257    parent->ranges[new_idx].offset = offset;
258    parent->ranges[new_idx].size = size;
259    parent->ranges[new_idx].bo = bo;
260    parent->ranges[new_idx].bo_offset = bo_offset;
261 
262    parent->range_count += range_count_delta;
263 
264    result = radv_amdgpu_winsys_rebuild_bo_list(parent);
265    if (result != VK_SUCCESS)
266       return result;
267 
268    return VK_SUCCESS;
269 }
270 
271 static void
272 radv_amdgpu_log_bo(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo, bool destroyed)
273 {
274    struct radv_amdgpu_winsys_bo_log *bo_log = NULL;
275 
276    if (!ws->debug_log_bos)
277       return;
278 
279    bo_log = calloc(1, sizeof(*bo_log));
280    if (!bo_log)
281       return;
282 
283    bo_log->va = bo->base.va;
284    bo_log->size = bo->base.size;
285    bo_log->timestamp = os_time_get_nano();
286    bo_log->is_virtual = bo->is_virtual;
287    bo_log->destroyed = destroyed;
288 
289    u_rwlock_wrlock(&ws->log_bo_list_lock);
290    list_addtail(&bo_log->list, &ws->log_bo_list);
291    u_rwlock_wrunlock(&ws->log_bo_list_lock);
292 }
293 
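/* The winsys-wide BO list tracks every allocated BO when RADV_DEBUG=allbos is
 * used, and otherwise only BOs made resident (see
 * radv_amdgpu_winsys_bo_make_resident), except local BOs which the kernel
 * tracks for us. The array grows geometrically and is protected by
 * ws->global_bo_list.lock. */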
294 static int
295 radv_amdgpu_global_bo_list_add(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo)
296 {
297    u_rwlock_wrlock(&ws->global_bo_list.lock);
298    if (ws->global_bo_list.count == ws->global_bo_list.capacity) {
299       unsigned capacity = MAX2(4, ws->global_bo_list.capacity * 2);
300       void *data = realloc(ws->global_bo_list.bos, capacity * sizeof(struct radv_amdgpu_winsys_bo *));
301       if (!data) {
302          u_rwlock_wrunlock(&ws->global_bo_list.lock);
303          return VK_ERROR_OUT_OF_HOST_MEMORY;
304       }
305 
306       ws->global_bo_list.bos = (struct radv_amdgpu_winsys_bo **)data;
307       ws->global_bo_list.capacity = capacity;
308    }
309 
310    ws->global_bo_list.bos[ws->global_bo_list.count++] = bo;
311    bo->base.use_global_list = true;
312    u_rwlock_wrunlock(&ws->global_bo_list.lock);
313    return VK_SUCCESS;
314 }
315 
316 static void
317 radv_amdgpu_global_bo_list_del(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo)
318 {
319    u_rwlock_wrlock(&ws->global_bo_list.lock);
320    for (unsigned i = ws->global_bo_list.count; i-- > 0;) {
321       if (ws->global_bo_list.bos[i] == bo) {
322          ws->global_bo_list.bos[i] = ws->global_bo_list.bos[ws->global_bo_list.count - 1];
323          --ws->global_bo_list.count;
324          bo->base.use_global_list = false;
325          break;
326       }
327    }
328    u_rwlock_wrunlock(&ws->global_bo_list.lock);
329 }
330 
331 static void
332 radv_amdgpu_winsys_bo_destroy(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo)
333 {
334    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
335    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
336 
337    radv_amdgpu_log_bo(ws, bo, true);
338 
339    if (bo->is_virtual) {
340       int r;
341 
342       /* Clear mappings of this PRT VA region. */
343       r = radv_amdgpu_bo_va_op(ws, 0, 0, bo->base.size, bo->base.va, 0, 0, AMDGPU_VA_OP_CLEAR);
344       if (r) {
345          fprintf(stderr, "radv/amdgpu: Failed to clear a PRT VA region (%d).\n", r);
346       }
347 
348       free(bo->bos);
349       free(bo->ranges);
350       u_rwlock_destroy(&bo->lock);
351    } else {
352       if (bo->cpu_map)
353          munmap(bo->cpu_map, bo->base.size);
354 
355       if (ws->debug_all_bos)
356          radv_amdgpu_global_bo_list_del(ws, bo);
357       radv_amdgpu_bo_va_op(ws, bo->bo_handle, 0, bo->base.size, bo->base.va, 0, 0, AMDGPU_VA_OP_UNMAP);
358       ac_drm_bo_free(ws->dev, bo->bo);
359    }
360 
361    if (bo->base.initial_domain & RADEON_DOMAIN_VRAM) {
362       if (bo->base.vram_no_cpu_access) {
363          p_atomic_add(&ws->allocated_vram, -align64(bo->base.size, ws->info.gart_page_size));
364       } else {
365          p_atomic_add(&ws->allocated_vram_vis, -align64(bo->base.size, ws->info.gart_page_size));
366       }
367    }
368 
369    if (bo->base.initial_domain & RADEON_DOMAIN_GTT)
370       p_atomic_add(&ws->allocated_gtt, -align64(bo->base.size, ws->info.gart_page_size));
371 
372    ac_drm_va_range_free(bo->va_handle);
373    FREE(bo);
374 }
375 
376 static VkResult
377 radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, uint64_t size, unsigned alignment,
378                              enum radeon_bo_domain initial_domain, enum radeon_bo_flag flags, unsigned priority,
379                              uint64_t replay_address, struct radeon_winsys_bo **out_bo)
380 {
381    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
382    struct radv_amdgpu_winsys_bo *bo;
383    struct amdgpu_bo_alloc_request request = {0};
384    struct radv_amdgpu_map_range *ranges = NULL;
385    ac_drm_bo buf_handle;
386    uint64_t va = 0;
387    amdgpu_va_handle va_handle;
388    int r;
389    VkResult result = VK_SUCCESS;
390 
391    /* Just be robust for callers that might use NULL-ness for determining if things should be freed.
392     */
393    *out_bo = NULL;
394 
395    bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
396    if (!bo) {
397       return VK_ERROR_OUT_OF_HOST_MEMORY;
398    }
399 
400    unsigned virt_alignment = alignment;
401    if (size >= ws->info.pte_fragment_size)
402       virt_alignment = MAX2(virt_alignment, ws->info.pte_fragment_size);
403 
404    assert(!replay_address || (flags & RADEON_FLAG_REPLAYABLE));
405 
406    const uint64_t va_flags = AMDGPU_VA_RANGE_HIGH | (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
407                              (flags & RADEON_FLAG_REPLAYABLE ? AMDGPU_VA_RANGE_REPLAYABLE : 0);
408    r = ac_drm_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, size, virt_alignment, replay_address, &va,
409                              &va_handle, va_flags);
410    if (r) {
411       result = replay_address ? VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS : VK_ERROR_OUT_OF_DEVICE_MEMORY;
412       goto error_va_alloc;
413    }
414 
415    bo->base.va = va;
416    bo->base.size = size;
417    bo->va_handle = va_handle;
418    bo->is_virtual = !!(flags & RADEON_FLAG_VIRTUAL);
419 
420    if (flags & RADEON_FLAG_VIRTUAL) {
421       ranges = realloc(NULL, sizeof(struct radv_amdgpu_map_range));
422       if (!ranges) {
423          result = VK_ERROR_OUT_OF_HOST_MEMORY;
424          goto error_ranges_alloc;
425       }
426 
427       u_rwlock_init(&bo->lock);
428 
429       bo->ranges = ranges;
430       bo->range_count = 1;
431       bo->range_capacity = 1;
432 
433       bo->ranges[0].offset = 0;
434       bo->ranges[0].size = size;
435       bo->ranges[0].bo = NULL;
436       bo->ranges[0].bo_offset = 0;
437 
438       /* Reserve a PRT VA region. */
439       r = radv_amdgpu_bo_va_op(ws, 0, 0, size, bo->base.va, 0, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
440       if (r) {
441          fprintf(stderr, "radv/amdgpu: Failed to reserve a PRT VA region (%d).\n", r);
442          result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
443          goto error_ranges_alloc;
444       }
445 
446       radv_amdgpu_log_bo(ws, bo, false);
447 
448       *out_bo = (struct radeon_winsys_bo *)bo;
449       return VK_SUCCESS;
450    }
451 
452    request.alloc_size = size;
453    request.phys_alignment = alignment;
454 
455    if (initial_domain & RADEON_DOMAIN_VRAM) {
456       request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
457 
458       /* Since VRAM and GTT have almost the same performance on
459        * APUs, we could just set GTT. However, in order to decrease
460        * GTT(RAM) usage, which is shared with the OS, allow VRAM
461        * placements too. The idea is not that VRAM is particularly useful
462        * here, but to use it so that it's not left sitting unused and wasted.
463        *
464        * Furthermore, even on discrete GPUs this is beneficial. If
465        * both GTT and VRAM are set then AMDGPU still prefers VRAM
466        * for the initial placement, but it makes the buffers
467        * spillable. Otherwise AMDGPU tries very hard to place the buffers
468        * in VRAM, to the extent that we get a lot of
469        * unnecessary movement. This helps significantly when
470        * e.g. Horizon Zero Dawn allocates more memory than we have
471        * VRAM.
472        */
473       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
474    }
475 
476    if (initial_domain & RADEON_DOMAIN_GTT)
477       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
478    if (initial_domain & RADEON_DOMAIN_GDS)
479       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
480    if (initial_domain & RADEON_DOMAIN_OA)
481       request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;
482 
483    if (flags & RADEON_FLAG_CPU_ACCESS)
484       request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
485    if (flags & RADEON_FLAG_NO_CPU_ACCESS) {
486       bo->base.vram_no_cpu_access = initial_domain & RADEON_DOMAIN_VRAM;
487       request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
488    }
489    if (flags & RADEON_FLAG_GTT_WC)
490       request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
491    if (!(flags & RADEON_FLAG_IMPLICIT_SYNC))
492       request.flags |= AMDGPU_GEM_CREATE_EXPLICIT_SYNC;
493    if ((initial_domain & RADEON_DOMAIN_VRAM_GTT) && (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) &&
494        ((ws->perftest & RADV_PERFTEST_LOCAL_BOS) || (flags & RADEON_FLAG_PREFER_LOCAL_BO))) {
495       /* virtio needs to be able to create a dmabuf if CPU access is required but a
496        * dmabuf cannot be created if VM_ALWAYS_VALID is used.
497        */
498       if (!ws->info.is_virtio || (request.flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)) {
499          bo->base.is_local = true;
500          request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
501       }
502    }
503    /* Set AMDGPU_GEM_CREATE_VIRTIO_SHARED if the driver didn't disable buffer sharing. */
504    if (ws->info.is_virtio && (initial_domain & RADEON_DOMAIN_VRAM_GTT) &&
505        (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) == 0)
506       request.flags |= AMDGPU_GEM_CREATE_VIRTIO_SHARED;
507    if (initial_domain & RADEON_DOMAIN_VRAM) {
508       if (ws->zero_all_vram_allocs || (flags & RADEON_FLAG_ZERO_VRAM))
509          request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
510    }
511 
512    if (flags & RADEON_FLAG_DISCARDABLE && ws->info.drm_minor >= 47)
513       request.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
514 
515    if (flags & RADEON_FLAG_GFX12_ALLOW_DCC && ws->info.drm_minor >= 58) {
516       assert(ws->info.gfx_level >= GFX12 && (initial_domain & RADEON_DOMAIN_VRAM));
517       bo->base.gfx12_allow_dcc = true;
518       request.flags |= AMDGPU_GEM_CREATE_GFX12_DCC;
519    }
520 
521    r = ac_drm_bo_alloc(ws->dev, &request, &buf_handle);
522    if (r) {
523       fprintf(stderr, "radv/amdgpu: Failed to allocate a buffer:\n");
524       fprintf(stderr, "radv/amdgpu:    size      : %" PRIu64 " bytes\n", size);
525       fprintf(stderr, "radv/amdgpu:    alignment : %u bytes\n", alignment);
526       fprintf(stderr, "radv/amdgpu:    domains   : %u\n", initial_domain);
527       result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
528       goto error_bo_alloc;
529    }
530 
531    uint32_t kms_handle = 0;
532    r = ac_drm_bo_export(ws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
533    assert(!r);
534 
535    r = radv_amdgpu_bo_va_op(ws, kms_handle, 0, size, va, flags, 0, AMDGPU_VA_OP_MAP);
536    if (r) {
537       result = VK_ERROR_UNKNOWN;
538       goto error_va_map;
539    }
540 
541    bo->bo = buf_handle;
542    bo->bo_handle = kms_handle;
543    bo->base.initial_domain = initial_domain;
544    bo->base.use_global_list = false;
545    bo->priority = priority;
546    bo->cpu_map = NULL;
547 
548    if (initial_domain & RADEON_DOMAIN_VRAM) {
549       /* Buffers allocated in VRAM with the NO_CPU_ACCESS flag
550        * aren't mappable and they are counted as part of the VRAM
551        * counter.
552        *
553        * Otherwise, buffers with the CPU_ACCESS flag or without either
554        * flag (imported buffers) are counted as part of the VRAM
555        * visible counter because they can be mapped.
556        */
557       if (bo->base.vram_no_cpu_access) {
558          p_atomic_add(&ws->allocated_vram, align64(bo->base.size, ws->info.gart_page_size));
559       } else {
560          p_atomic_add(&ws->allocated_vram_vis, align64(bo->base.size, ws->info.gart_page_size));
561       }
562    }
563 
564    if (initial_domain & RADEON_DOMAIN_GTT)
565       p_atomic_add(&ws->allocated_gtt, align64(bo->base.size, ws->info.gart_page_size));
566 
567    if (ws->debug_all_bos)
568       radv_amdgpu_global_bo_list_add(ws, bo);
569    radv_amdgpu_log_bo(ws, bo, false);
570 
571    *out_bo = (struct radeon_winsys_bo *)bo;
572    return VK_SUCCESS;
573 error_va_map:
574    ac_drm_bo_free(ws->dev, buf_handle);
575 
576 error_bo_alloc:
577    free(ranges);
578 
579 error_ranges_alloc:
580    ac_drm_va_range_free(va_handle);
581 
582 error_va_alloc:
583    FREE(bo);
584    return result;
585 }
586 
587 static void *
588 radv_amdgpu_winsys_bo_map(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, bool use_fixed_addr,
589                           void *fixed_addr)
590 {
591    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
592 
593    /* Safeguard for the Quantic Dream layer skipping unmaps. */
594    if (bo->cpu_map && !use_fixed_addr)
595       return bo->cpu_map;
596 
597    assert(!bo->cpu_map);
598 
599 #if HAVE_AMDGPU_VIRTIO
600    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
601    if (ws->info.is_virtio) {
602       /* We can't use DRM_AMDGPU_GEM_MMAP directly on virtio. Instead use bo_cpu_map since
603        * the virtio version will map the buffer at the given address (if not NULL).
604        */
605       void *data = NULL;
606       if (use_fixed_addr)
607          data = fixed_addr;
608 
609       if (ac_drm_bo_cpu_map(ws->dev, bo->bo, &data))
610          return NULL;
611       return data;
612    }
613 #endif
614 
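   /* Non-virtio path: DRM_AMDGPU_GEM_MMAP returns a fake offset for the GEM
    * handle, which is then passed to mmap() on the DRM fd (optionally at a
    * caller-fixed address) to get a CPU mapping of the BO. */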
615    union drm_amdgpu_gem_mmap args;
616    memset(&args, 0, sizeof(args));
617    args.in.handle = bo->bo_handle;
618 
619    int ret = drm_ioctl_write_read(radv_amdgpu_winsys(_ws)->fd, DRM_AMDGPU_GEM_MMAP, &args, sizeof(args));
620    if (ret)
621       return NULL;
622 
623    void *data = mmap(fixed_addr, bo->base.size, PROT_READ | PROT_WRITE, MAP_SHARED | (use_fixed_addr ? MAP_FIXED : 0),
624                      radv_amdgpu_winsys(_ws)->fd, args.out.addr_ptr);
625    if (data == MAP_FAILED)
626       return NULL;
627 
628    bo->cpu_map = data;
629    return data;
630 }
631 
632 static void
633 radv_amdgpu_winsys_bo_unmap(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, bool replace)
634 {
635    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
636 
637    /* Defense in depth against buggy apps. */
638    if (!bo->cpu_map && !replace)
639       return;
640 
641    assert(bo->cpu_map);
642    if (replace) {
643       (void)mmap(bo->cpu_map, bo->base.size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
644    } else {
645 #if HAVE_AMDGPU_VIRTIO
646       struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
647       if (ws->info.is_virtio)
648          ac_drm_bo_cpu_unmap(ws->dev, bo->bo);
649       else
650 #endif
651          munmap(bo->cpu_map, bo->base.size);
652    }
653    bo->cpu_map = NULL;
654 }
655 
656 static uint64_t
657 radv_amdgpu_get_optimal_vm_alignment(struct radv_amdgpu_winsys *ws, uint64_t size, unsigned alignment)
658 {
659    uint64_t vm_alignment = alignment;
660 
661    /* Increase the VM alignment for faster address translation. */
662    if (size >= ws->info.pte_fragment_size)
663       vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size);
664 
665    /* Gfx9: Increase the VM alignment to the most significant bit set
666     * in the size for faster address translation.
667     */
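   /* For example, a 96 KiB buffer has its most significant bit at 64 KiB, so
    * the VM alignment is raised to at least 64 KiB. */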
668    if (ws->info.gfx_level >= GFX9) {
669       unsigned msb = util_last_bit64(size); /* 0 = no bit is set */
670       uint64_t msb_alignment = msb ? 1ull << (msb - 1) : 0;
671 
672       vm_alignment = MAX2(vm_alignment, msb_alignment);
673    }
674    return vm_alignment;
675 }
676 
677 static VkResult
678 radv_amdgpu_winsys_bo_from_ptr(struct radeon_winsys *_ws, void *pointer, uint64_t size, unsigned priority,
679                                struct radeon_winsys_bo **out_bo)
680 {
681    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
682    ac_drm_bo buf_handle;
683    struct radv_amdgpu_winsys_bo *bo;
684    uint64_t va;
685    amdgpu_va_handle va_handle;
686    uint64_t vm_alignment;
687    VkResult result = VK_SUCCESS;
688    int ret;
689 
690    /* Just be robust for callers that might use NULL-ness for determining if things should be freed.
691     */
692    *out_bo = NULL;
693 
694    bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
695    if (!bo)
696       return VK_ERROR_OUT_OF_HOST_MEMORY;
697 
698    ret = ac_drm_create_bo_from_user_mem(ws->dev, pointer, size, &buf_handle);
699    if (ret) {
700       if (ret == -EINVAL) {
701          result = VK_ERROR_INVALID_EXTERNAL_HANDLE;
702       } else {
703          result = VK_ERROR_UNKNOWN;
704       }
705       goto error;
706    }
707 
708    /* Using the optimal VM alignment also fixes GPU hangs for buffers that
709     * are imported.
710     */
711    vm_alignment = radv_amdgpu_get_optimal_vm_alignment(ws, size, ws->info.gart_page_size);
712 
713    if (ac_drm_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, size, vm_alignment, 0, &va, &va_handle,
714                              AMDGPU_VA_RANGE_HIGH)) {
715       result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
716       goto error_va_alloc;
717    }
718 
719    uint32_t kms_handle = 0;
720    ASSERTED int r = ac_drm_bo_export(ws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
721    assert(!r);
722 
723    if (ac_drm_bo_va_op(ws->dev, kms_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP)) {
724       result = VK_ERROR_UNKNOWN;
725       goto error_va_map;
726    }
727 
728    /* Initialize it */
729    bo->base.va = va;
730    bo->va_handle = va_handle;
731    bo->base.size = size;
732    bo->bo = buf_handle;
733    bo->bo_handle = kms_handle;
734    bo->base.initial_domain = RADEON_DOMAIN_GTT;
735    bo->base.use_global_list = false;
736    bo->priority = priority;
737    bo->cpu_map = NULL;
738 
739    p_atomic_add(&ws->allocated_gtt, align64(bo->base.size, ws->info.gart_page_size));
740 
741    if (ws->debug_all_bos)
742       radv_amdgpu_global_bo_list_add(ws, bo);
743    radv_amdgpu_log_bo(ws, bo, false);
744 
745    *out_bo = (struct radeon_winsys_bo *)bo;
746    return VK_SUCCESS;
747 
748 error_va_map:
749    ac_drm_va_range_free(va_handle);
750 
751 error_va_alloc:
752    ac_drm_bo_free(ws->dev, buf_handle);
753 
754 error:
755    FREE(bo);
756    return result;
757 }
758 
759 static VkResult
760 radv_amdgpu_winsys_bo_from_fd(struct radeon_winsys *_ws, int fd, unsigned priority, struct radeon_winsys_bo **out_bo,
761                               uint64_t *alloc_size)
762 {
763    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
764    struct radv_amdgpu_winsys_bo *bo;
765    uint64_t va;
766    amdgpu_va_handle va_handle;
767    enum amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd;
768    struct ac_drm_bo_import_result result;
769    struct amdgpu_bo_info info;
770    enum radeon_bo_domain initial = 0;
771    int r;
772    VkResult vk_result = VK_SUCCESS;
773 
774    /* Just be robust for callers that might use NULL-ness for determining if things should be freed.
775     */
776    *out_bo = NULL;
777 
778    bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
779    if (!bo)
780       return VK_ERROR_OUT_OF_HOST_MEMORY;
781 
782    r = ac_drm_bo_import(ws->dev, type, fd, &result);
783    if (r) {
784       vk_result = VK_ERROR_INVALID_EXTERNAL_HANDLE;
785       goto error;
786    }
787 
788    uint32_t kms_handle = 0;
789    r = ac_drm_bo_export(ws->dev, result.bo, amdgpu_bo_handle_type_kms, &kms_handle);
790    assert(!r);
791 
792    r = ac_drm_bo_query_info(ws->dev, kms_handle, &info);
793    if (r) {
794       vk_result = VK_ERROR_UNKNOWN;
795       goto error_query;
796    }
797 
798    if (alloc_size) {
799       *alloc_size = info.alloc_size;
800    }
801 
802    r = ac_drm_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, result.alloc_size, 1 << 20, 0, &va, &va_handle,
803                              AMDGPU_VA_RANGE_HIGH);
804    if (r) {
805       vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
806       goto error_query;
807    }
808 
809    r = radv_amdgpu_bo_va_op(ws, kms_handle, 0, result.alloc_size, va, 0, 0, AMDGPU_VA_OP_MAP);
810    if (r) {
811       vk_result = VK_ERROR_UNKNOWN;
812       goto error_va_map;
813    }
814 
815    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
816       initial |= RADEON_DOMAIN_VRAM;
817    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
818       initial |= RADEON_DOMAIN_GTT;
819 
820    bo->bo = result.bo;
821    bo->bo_handle = kms_handle;
822    bo->base.va = va;
823    bo->va_handle = va_handle;
824    bo->base.initial_domain = initial;
825    bo->base.use_global_list = false;
826    bo->base.size = result.alloc_size;
827    bo->priority = priority;
828    bo->cpu_map = NULL;
829 
830    if (bo->base.initial_domain & RADEON_DOMAIN_VRAM)
831       p_atomic_add(&ws->allocated_vram, align64(bo->base.size, ws->info.gart_page_size));
832    if (bo->base.initial_domain & RADEON_DOMAIN_GTT)
833       p_atomic_add(&ws->allocated_gtt, align64(bo->base.size, ws->info.gart_page_size));
834 
835    if (ws->debug_all_bos)
836       radv_amdgpu_global_bo_list_add(ws, bo);
837    radv_amdgpu_log_bo(ws, bo, false);
838 
839    *out_bo = (struct radeon_winsys_bo *)bo;
840    return VK_SUCCESS;
841 error_va_map:
842    ac_drm_va_range_free(va_handle);
843 
844 error_query:
845    ac_drm_bo_free(ws->dev, result.bo);
846 
847 error:
848    FREE(bo);
849    return vk_result;
850 }
851 
852 static bool
853 radv_amdgpu_winsys_get_fd(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, int *fd)
854 {
855    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
856    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
857    enum amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd;
858    int r;
859    unsigned handle;
860    r = ac_drm_bo_export(ws->dev, bo->bo, type, &handle);
861    if (r)
862       return false;
863 
864    *fd = (int)handle;
865    return true;
866 }
867 
868 static bool
869 radv_amdgpu_bo_get_flags_from_fd(struct radeon_winsys *_ws, int fd, enum radeon_bo_domain *domains,
870                                  enum radeon_bo_flag *flags)
871 {
872    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
873    struct ac_drm_bo_import_result result = {0};
874    struct amdgpu_bo_info info = {0};
875    int r;
876 
877    *domains = 0;
878    *flags = 0;
879 
880    r = ac_drm_bo_import(ws->dev, amdgpu_bo_handle_type_dma_buf_fd, fd, &result);
881    if (r)
882       return false;
883 
884    uint32_t kms_handle = 0;
885    r = ac_drm_bo_export(ws->dev, result.bo, amdgpu_bo_handle_type_kms, &kms_handle);
886    assert(!r);
887 
888    r = ac_drm_bo_query_info(ws->dev, kms_handle, &info);
889    ac_drm_bo_free(ws->dev, result.bo);
890    if (r)
891       return false;
892 
893    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
894       *domains |= RADEON_DOMAIN_VRAM;
895    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
896       *domains |= RADEON_DOMAIN_GTT;
897    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GDS)
898       *domains |= RADEON_DOMAIN_GDS;
899    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_OA)
900       *domains |= RADEON_DOMAIN_OA;
901 
902    if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
903       *flags |= RADEON_FLAG_CPU_ACCESS;
904    if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)
905       *flags |= RADEON_FLAG_NO_CPU_ACCESS;
906    if (!(info.alloc_flags & AMDGPU_GEM_CREATE_EXPLICIT_SYNC))
907       *flags |= RADEON_FLAG_IMPLICIT_SYNC;
908    if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
909       *flags |= RADEON_FLAG_GTT_WC;
910    if (info.alloc_flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
911       *flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_PREFER_LOCAL_BO;
912    if (info.alloc_flags & AMDGPU_GEM_CREATE_VRAM_CLEARED)
913       *flags |= RADEON_FLAG_ZERO_VRAM;
914    if (info.alloc_flags & AMDGPU_GEM_CREATE_GFX12_DCC)
915       *flags |= RADEON_FLAG_GFX12_ALLOW_DCC;
916    return true;
917 }
918 
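/* Convert between the hardware TILE_SPLIT field encoding (0..6) and the tile
 * split size in bytes (64..4096). radv_eg_tile_split_rev() is the inverse of
 * eg_tile_split(); both fall back to 1024 bytes / encoding 4 for unknown
 * values. */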
919 static unsigned
920 eg_tile_split(unsigned tile_split)
921 {
922    switch (tile_split) {
923    case 0:
924       tile_split = 64;
925       break;
926    case 1:
927       tile_split = 128;
928       break;
929    case 2:
930       tile_split = 256;
931       break;
932    case 3:
933       tile_split = 512;
934       break;
935    default:
936    case 4:
937       tile_split = 1024;
938       break;
939    case 5:
940       tile_split = 2048;
941       break;
942    case 6:
943       tile_split = 4096;
944       break;
945    }
946    return tile_split;
947 }
948 
949 static unsigned
950 radv_eg_tile_split_rev(unsigned eg_tile_split)
951 {
952    switch (eg_tile_split) {
953    case 64:
954       return 0;
955    case 128:
956       return 1;
957    case 256:
958       return 2;
959    case 512:
960       return 3;
961    default:
962    case 1024:
963       return 4;
964    case 2048:
965       return 5;
966    case 4096:
967       return 6;
968    }
969 }
970 
971 #define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_SHIFT 45
972 #define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_MASK  0x3
973 
974 static void
975 radv_amdgpu_winsys_bo_set_metadata(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo,
976                                    struct radeon_bo_metadata *md)
977 {
978    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
979    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
980    struct amdgpu_bo_metadata metadata = {0};
981    uint64_t tiling_flags = 0;
982 
983    if (ws->info.gfx_level >= GFX12) {
984       tiling_flags |= AMDGPU_TILING_SET(GFX12_SWIZZLE_MODE, md->u.gfx12.swizzle_mode);
985       tiling_flags |= AMDGPU_TILING_SET(GFX12_DCC_MAX_COMPRESSED_BLOCK, md->u.gfx12.dcc_max_compressed_block);
986       tiling_flags |= AMDGPU_TILING_SET(GFX12_DCC_NUMBER_TYPE, md->u.gfx12.dcc_number_type);
987       tiling_flags |= AMDGPU_TILING_SET(GFX12_DCC_DATA_FORMAT, md->u.gfx12.dcc_data_format);
988       tiling_flags |= AMDGPU_TILING_SET(GFX12_DCC_WRITE_COMPRESS_DISABLE, md->u.gfx12.dcc_write_compress_disable);
989       tiling_flags |= AMDGPU_TILING_SET(GFX12_SCANOUT, md->u.gfx12.scanout);
990    } else if (ws->info.gfx_level >= GFX9) {
991       tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
992       tiling_flags |= AMDGPU_TILING_SET(DCC_OFFSET_256B, md->u.gfx9.dcc_offset_256b);
993       tiling_flags |= AMDGPU_TILING_SET(DCC_PITCH_MAX, md->u.gfx9.dcc_pitch_max);
994       tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, md->u.gfx9.dcc_independent_64b_blocks);
995       tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_128B, md->u.gfx9.dcc_independent_128b_blocks);
996       tiling_flags |= AMDGPU_TILING_SET(DCC_MAX_COMPRESSED_BLOCK_SIZE, md->u.gfx9.dcc_max_compressed_block_size);
997       tiling_flags |= AMDGPU_TILING_SET(SCANOUT, md->u.gfx9.scanout);
998    } else {
999       if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
1000          tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
1001       else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
1002          tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
1003       else
1004          tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
1005 
1006       tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
1007       tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
1008       tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
1009       if (md->u.legacy.tile_split)
1010          tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, radv_eg_tile_split_rev(md->u.legacy.tile_split));
1011       tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
1012       tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks) - 1);
1013 
1014       if (md->u.legacy.scanout)
1015          tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
1016       else
1017          tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
1018    }
1019 
1020    metadata.tiling_info = tiling_flags;
1021    metadata.size_metadata = md->size_metadata;
1022    memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
1023 
1024    ac_drm_bo_set_metadata(ws->dev, bo->bo_handle, &metadata);
1025 }
1026 
1027 static void
1028 radv_amdgpu_winsys_bo_get_metadata(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo,
1029                                    struct radeon_bo_metadata *md)
1030 {
1031    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1032    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
1033    struct amdgpu_bo_info info = {0};
1034 
1035    int r = ac_drm_bo_query_info(ws->dev, bo->bo_handle, &info);
1036    if (r)
1037       return;
1038 
1039    uint64_t tiling_flags = info.metadata.tiling_info;
1040 
1041    if (ws->info.gfx_level >= GFX12) {
1042       md->u.gfx12.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, GFX12_SWIZZLE_MODE);
1043       md->u.gfx12.dcc_max_compressed_block = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_MAX_COMPRESSED_BLOCK);
1044       md->u.gfx12.dcc_data_format = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_DATA_FORMAT);
1045       md->u.gfx12.dcc_number_type = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_NUMBER_TYPE);
1046       md->u.gfx12.dcc_write_compress_disable = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_WRITE_COMPRESS_DISABLE);
1047       md->u.gfx12.scanout = AMDGPU_TILING_GET(tiling_flags, GFX12_SCANOUT);
1048    } else if (ws->info.gfx_level >= GFX9) {
1049       md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
1050       md->u.gfx9.scanout = AMDGPU_TILING_GET(tiling_flags, SCANOUT);
1051    } else {
1052       md->u.legacy.microtile = RADEON_LAYOUT_LINEAR;
1053       md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR;
1054 
1055       if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */
1056          md->u.legacy.macrotile = RADEON_LAYOUT_TILED;
1057       else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
1058          md->u.legacy.microtile = RADEON_LAYOUT_TILED;
1059 
1060       md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
1061       md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
1062       md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
1063       md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
1064       md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
1065       md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
1066       md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
1067    }
1068 
1069    md->size_metadata = info.metadata.size_metadata;
1070    memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
1071 }
1072 
1073 static VkResult
1074 radv_amdgpu_winsys_bo_make_resident(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, bool resident)
1075 {
1076    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1077    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
1078    VkResult result = VK_SUCCESS;
1079 
1080    /* Do not add the BO to the global list if it's a local BO because the
1081     * kernel maintains a list for us.
1082     */
1083    if (bo->base.is_local)
1084       return VK_SUCCESS;
1085 
1086    /* Do not add the BO twice to the global list if the allbos debug
1087     * option is enabled.
1088     */
1089    if (ws->debug_all_bos)
1090       return VK_SUCCESS;
1091 
1092    if (resident) {
1093       result = radv_amdgpu_global_bo_list_add(ws, bo);
1094    } else {
1095       radv_amdgpu_global_bo_list_del(ws, bo);
1096    }
1097 
1098    return result;
1099 }
1100 
1101 static int
1102 radv_amdgpu_bo_va_compare(const void *a, const void *b)
1103 {
1104    const struct radv_amdgpu_winsys_bo *bo_a = *(const struct radv_amdgpu_winsys_bo *const *)a;
1105    const struct radv_amdgpu_winsys_bo *bo_b = *(const struct radv_amdgpu_winsys_bo *const *)b;
1106    return bo_a->base.va < bo_b->base.va ? -1 : bo_a->base.va > bo_b->base.va ? 1 : 0;
1107 }
1108 
1109 static uint64_t
1110 radv_amdgpu_canonicalize_va(uint64_t va)
1111 {
1112    /* It would be less hardcoded to use addr32_hi (0xffff8000) to generate a mask,
1113     * but there are confusing differences with page fault reports from the kernel,
1114     * which seem to report the top 48 bits, while addr32_hi only has 47 bits. */
1115    return va & ((1ull << 48) - 1);
1116 }
1117 
1118 static void
1119 radv_amdgpu_dump_bo_log(struct radeon_winsys *_ws, FILE *file)
1120 {
1121    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1122    struct radv_amdgpu_winsys_bo_log *bo_log;
1123 
1124    if (!ws->debug_log_bos)
1125       return;
1126 
1127    u_rwlock_rdlock(&ws->log_bo_list_lock);
1128    LIST_FOR_EACH_ENTRY (bo_log, &ws->log_bo_list, list) {
1129       if (bo_log->virtual_mapping) {
1130          fprintf(file, "timestamp=%llu, VA=%.16llx-%.16llx, mapped_to=%.16llx\n",
1131                  (long long)bo_log->timestamp,
1132                  (long long)radv_amdgpu_canonicalize_va(bo_log->va),
1133                  (long long)radv_amdgpu_canonicalize_va(bo_log->va + bo_log->size),
1134                  (long long)radv_amdgpu_canonicalize_va(bo_log->mapped_va));
1135       } else {
1136          fprintf(file, "timestamp=%llu, VA=%.16llx-%.16llx, destroyed=%d, is_virtual=%d\n",
1137                  (long long)bo_log->timestamp,
1138                  (long long)radv_amdgpu_canonicalize_va(bo_log->va),
1139                  (long long)radv_amdgpu_canonicalize_va(bo_log->va + bo_log->size), bo_log->destroyed,
1140                  bo_log->is_virtual);
1141       }
1142    }
1143    u_rwlock_rdunlock(&ws->log_bo_list_lock);
1144 }
1145 
1146 static void
1147 radv_amdgpu_dump_bo_ranges(struct radeon_winsys *_ws, FILE *file)
1148 {
1149    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1150    if (ws->debug_all_bos) {
1151       struct radv_amdgpu_winsys_bo **bos = NULL;
1152       int i = 0;
1153 
1154       u_rwlock_rdlock(&ws->global_bo_list.lock);
1155       bos = malloc(sizeof(*bos) * ws->global_bo_list.count);
1156       if (!bos) {
1157          u_rwlock_rdunlock(&ws->global_bo_list.lock);
1158          fprintf(file, "  Failed to allocate memory to sort VA ranges for dumping\n");
1159          return;
1160       }
1161 
1162       for (i = 0; i < ws->global_bo_list.count; i++) {
1163          bos[i] = ws->global_bo_list.bos[i];
1164       }
1165       qsort(bos, ws->global_bo_list.count, sizeof(bos[0]), radv_amdgpu_bo_va_compare);
1166 
1167       for (i = 0; i < ws->global_bo_list.count; ++i) {
1168          fprintf(file, "  VA=%.16llx-%.16llx, handle=%d\n", (long long)radv_amdgpu_canonicalize_va(bos[i]->base.va),
1169                  (long long)radv_amdgpu_canonicalize_va(bos[i]->base.va + bos[i]->base.size), bos[i]->bo_handle);
1170       }
1171       free(bos);
1172       u_rwlock_rdunlock(&ws->global_bo_list.lock);
1173    } else
1174       fprintf(file, "  To get BO VA ranges, please specify RADV_DEBUG=allbos\n");
1175 }
1176 void
1177 radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws)
1178 {
1179    ws->base.buffer_create = radv_amdgpu_winsys_bo_create;
1180    ws->base.buffer_destroy = radv_amdgpu_winsys_bo_destroy;
1181    ws->base.buffer_map = radv_amdgpu_winsys_bo_map;
1182    ws->base.buffer_unmap = radv_amdgpu_winsys_bo_unmap;
1183    ws->base.buffer_from_ptr = radv_amdgpu_winsys_bo_from_ptr;
1184    ws->base.buffer_from_fd = radv_amdgpu_winsys_bo_from_fd;
1185    ws->base.buffer_get_fd = radv_amdgpu_winsys_get_fd;
1186    ws->base.buffer_set_metadata = radv_amdgpu_winsys_bo_set_metadata;
1187    ws->base.buffer_get_metadata = radv_amdgpu_winsys_bo_get_metadata;
1188    ws->base.buffer_virtual_bind = radv_amdgpu_winsys_bo_virtual_bind;
1189    ws->base.buffer_get_flags_from_fd = radv_amdgpu_bo_get_flags_from_fd;
1190    ws->base.buffer_make_resident = radv_amdgpu_winsys_bo_make_resident;
1191    ws->base.dump_bo_ranges = radv_amdgpu_dump_bo_ranges;
1192    ws->base.dump_bo_log = radv_amdgpu_dump_bo_log;
1193 }
1194