/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based on amdgpu winsys.
 * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
 * Copyright © 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include <stdio.h>

#include "radv_amdgpu_bo.h"
#include "radv_debug.h"

#include <amdgpu.h>
#include <inttypes.h>
#include <pthread.h>
#include <unistd.h>
#include <xf86drm.h>
#include "drm-uapi/amdgpu_drm.h"
#include <sys/mman.h>
#include "ac_linux_drm.h"

#include "util/os_drm.h"
#include "util/os_time.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include "util/u_memory.h"

static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo);

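/* Translate winsys BO flags into AMDGPU_VM_PAGE_* bits when a real BO handle is given (readable
 * and executable always, writeable unless READ_ONLY, MTYPE_UC for uncached VAs on GFX9+); for a
 * NULL handle the caller-provided internal flags (e.g. AMDGPU_VM_PAGE_PRT) are used as-is. The
 * size is aligned to the CPU page size before the VA operation is submitted to the kernel.
 */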
static int
radv_amdgpu_bo_va_op(struct radv_amdgpu_winsys *ws, uint32_t bo_handle, uint64_t offset, uint64_t size, uint64_t addr,
                     uint32_t bo_flags, uint64_t internal_flags, uint32_t ops)
{
   uint64_t flags = internal_flags;
   if (bo_handle) {
      flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_EXECUTABLE;

      if ((bo_flags & RADEON_FLAG_VA_UNCACHED) && ws->info.gfx_level >= GFX9)
         flags |= AMDGPU_VM_MTYPE_UC;

      if (!(bo_flags & RADEON_FLAG_READ_ONLY))
         flags |= AMDGPU_VM_PAGE_WRITEABLE;
   }

   size = align64(size, getpagesize());

   return ac_drm_bo_va_op_raw(ws->dev, bo_handle, offset, size, addr, flags, ops);
}

static int
bo_comparator(const void *ap, const void *bp)
{
   struct radv_amdgpu_bo *a = *(struct radv_amdgpu_bo *const *)ap;
   struct radv_amdgpu_bo *b = *(struct radv_amdgpu_bo *const *)bp;
   return (a > b) ? 1 : (a < b) ? -1 : 0;
}

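/* Rebuild the flat array of backing BOs for a virtual BO: grow the array if needed, collect the
 * BO of every mapped range, then sort and deduplicate by pointer so each backing BO appears once.
 */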
static VkResult
radv_amdgpu_winsys_rebuild_bo_list(struct radv_amdgpu_winsys_bo *bo)
{
   u_rwlock_wrlock(&bo->lock);

   if (bo->bo_capacity < bo->range_count) {
      uint32_t new_count = MAX2(bo->bo_capacity * 2, bo->range_count);
      struct radv_amdgpu_winsys_bo **bos = realloc(bo->bos, new_count * sizeof(struct radv_amdgpu_winsys_bo *));
      if (!bos) {
         u_rwlock_wrunlock(&bo->lock);
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      }
      bo->bos = bos;
      bo->bo_capacity = new_count;
   }

   uint32_t temp_bo_count = 0;
   for (uint32_t i = 0; i < bo->range_count; ++i)
      if (bo->ranges[i].bo)
         bo->bos[temp_bo_count++] = bo->ranges[i].bo;

   qsort(bo->bos, temp_bo_count, sizeof(struct radv_amdgpu_winsys_bo *), &bo_comparator);

   if (!temp_bo_count) {
      bo->bo_count = 0;
   } else {
      uint32_t final_bo_count = 1;
      for (uint32_t i = 1; i < temp_bo_count; ++i)
         if (bo->bos[i] != bo->bos[i - 1])
            bo->bos[final_bo_count++] = bo->bos[i];

      bo->bo_count = final_bo_count;
   }

   u_rwlock_wrunlock(&bo->lock);
   return VK_SUCCESS;
}

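/* Record a virtual (sparse) mapping event in the winsys BO log (no-op unless ws->debug_log_bos is
 * set); the entry is later printed by radv_amdgpu_dump_bo_log().
 */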
static void
radv_amdgpu_log_va_op(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo, uint64_t offset, uint64_t size,
                      uint64_t virtual_va)
{
   struct radv_amdgpu_winsys_bo_log *bo_log = NULL;

   if (!ws->debug_log_bos)
      return;

   bo_log = calloc(1, sizeof(*bo_log));
   if (!bo_log)
      return;

   bo_log->va = virtual_va;
   bo_log->size = size;
   bo_log->timestamp = os_time_get_nano();
   bo_log->virtual_mapping = 1;
   bo_log->mapped_va = bo ? (bo->base.va + offset) : 0;

   u_rwlock_wrlock(&ws->log_bo_list_lock);
   list_addtail(&bo_log->list, &ws->log_bo_list);
   u_rwlock_wrunlock(&ws->log_bo_list_lock);
}

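/* Bind (or unbind when bo == NULL) a sub-range of a virtual BO. The kernel side is a single
 * AMDGPU_VA_OP_REPLACE; the rest of the function keeps the parent's sorted, non-overlapping range
 * list consistent: find the ranges touched by [offset, offset + size), try to merge with
 * neighbours that map the same BO contiguously, split the first/last range when they are only
 * partially covered, and insert the new range in the slot that frees up.
 *
 * For example, binding into the middle of a single existing range splits it into three: the head
 * of the old range, the new range, and the tail of the old range (first == last,
 * remove_first == remove_last == false, range_count_delta == 2).
 */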
static VkResult
radv_amdgpu_winsys_bo_virtual_bind(struct radeon_winsys *_ws, struct radeon_winsys_bo *_parent, uint64_t offset,
                                   uint64_t size, struct radeon_winsys_bo *_bo, uint64_t bo_offset)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *parent = (struct radv_amdgpu_winsys_bo *)_parent;
   struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo *)_bo;
   int range_count_delta, new_idx;
   int first = 0, last;
   struct radv_amdgpu_map_range new_first, new_last;
   VkResult result;
   int r;

   assert(parent->is_virtual);
   assert(!bo || !bo->is_virtual);

   /* When the BO is NULL, AMDGPU will reset the PTE VA range to the initial state. Otherwise, it
    * will first unmap all existing mappings that overlap the requested range and then map it.
    */
   if (bo) {
      r =
         radv_amdgpu_bo_va_op(ws, bo->bo_handle, bo_offset, size, parent->base.va + offset, 0, 0, AMDGPU_VA_OP_REPLACE);
      radv_amdgpu_log_va_op(ws, bo, bo_offset, size, parent->base.va + offset);
   } else {
      r = radv_amdgpu_bo_va_op(ws, 0, 0, size, parent->base.va + offset, 0, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
      radv_amdgpu_log_va_op(ws, NULL, 0, size, parent->base.va + offset);
   }

   if (r) {
      fprintf(stderr, "radv/amdgpu: Failed to replace a PRT VA region (%d).\n", r);
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
   }

   /* Do not add the BO to the virtual BO list if it's already in the global list, to avoid dangling
    * BO references: it might have been destroyed without being previously unbound. Resetting it to
    * NULL clears the old BO ranges if present.
    *
    * This is going to be clarified in the Vulkan spec:
    * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3125
    *
    * The issue still exists for non-global BOs, but it will be addressed later, once we are 100%
    * sure it's RADV's fault (mostly because the solution looks more complicated).
    */
   if (bo && radv_buffer_is_resident(&bo->base)) {
      bo = NULL;
      bo_offset = 0;
   }

   /* We have at most 2 new ranges (1 by the bind, and another one by splitting a range that
    * contains the newly bound range). */
   if (parent->range_capacity - parent->range_count < 2) {
      uint32_t range_capacity = parent->range_capacity + 2;
      struct radv_amdgpu_map_range *ranges =
         realloc(parent->ranges, range_capacity * sizeof(struct radv_amdgpu_map_range));
      if (!ranges)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      parent->ranges = ranges;
      parent->range_capacity = range_capacity;
   }

   /*
    * [first, last] is exactly the range of ranges that either overlap the
    * newly bound range or are adjacent to it. This corresponds to the bind
    * ranges that may change.
    */
   while (first + 1 < parent->range_count && parent->ranges[first].offset + parent->ranges[first].size < offset)
      ++first;

   last = first;
   while (last + 1 < parent->range_count && parent->ranges[last + 1].offset <= offset + size)
      ++last;

   /* Whether the first and last ranges are going to be totally removed or just
    * resized/left alone. Note that when first == last, we will split it into a
    * part before and a part after the new range. The remove flag then indicates
    * whether to skip creating the corresponding split part. */
   bool remove_first = parent->ranges[first].offset == offset;
   bool remove_last = parent->ranges[last].offset + parent->ranges[last].size == offset + size;

   assert(parent->ranges[first].offset <= offset);
   assert(parent->ranges[last].offset + parent->ranges[last].size >= offset + size);

   /* Try to merge the new range with the first range. */
   if (parent->ranges[first].bo == bo &&
       (!bo || offset - bo_offset == parent->ranges[first].offset - parent->ranges[first].bo_offset)) {
      size += offset - parent->ranges[first].offset;
      offset = parent->ranges[first].offset;
      bo_offset = parent->ranges[first].bo_offset;
      remove_first = true;
   }

   /* Try to merge the new range with the last range. */
   if (parent->ranges[last].bo == bo &&
       (!bo || offset - bo_offset == parent->ranges[last].offset - parent->ranges[last].bo_offset)) {
      size = parent->ranges[last].offset + parent->ranges[last].size - offset;
      remove_last = true;
   }

   range_count_delta = 1 - (last - first + 1) + !remove_first + !remove_last;
   new_idx = first + !remove_first;

   /* If the first/last ranges are not left alone, we unmap them and optionally map
    * them again after modifications. Note that this implicitly handles the splitting
    * when first == last. */
   new_first = parent->ranges[first];
   new_last = parent->ranges[last];

   if (parent->ranges[first].offset + parent->ranges[first].size > offset || remove_first) {
      if (!remove_first) {
         new_first.size = offset - new_first.offset;
      }
   }

   if (parent->ranges[last].offset < offset + size || remove_last) {
      if (!remove_last) {
         new_last.size -= offset + size - new_last.offset;
         new_last.bo_offset += (offset + size - new_last.offset);
         new_last.offset = offset + size;
      }
   }

   /* Moves the range list after last to account for the changed number of ranges. */
   memmove(parent->ranges + last + 1 + range_count_delta, parent->ranges + last + 1,
           sizeof(struct radv_amdgpu_map_range) * (parent->range_count - last - 1));

   if (!remove_first)
      parent->ranges[first] = new_first;

   if (!remove_last)
      parent->ranges[new_idx + 1] = new_last;

   /* Actually set up the new range. */
   parent->ranges[new_idx].offset = offset;
   parent->ranges[new_idx].size = size;
   parent->ranges[new_idx].bo = bo;
   parent->ranges[new_idx].bo_offset = bo_offset;

   parent->range_count += range_count_delta;

   result = radv_amdgpu_winsys_rebuild_bo_list(parent);
   if (result != VK_SUCCESS)
      return result;

   return VK_SUCCESS;
}

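/* Append a creation/destruction event for this BO to the winsys BO log (no-op unless
 * ws->debug_log_bos is set).
 */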
static void
radv_amdgpu_log_bo(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo, bool destroyed)
{
   struct radv_amdgpu_winsys_bo_log *bo_log = NULL;

   if (!ws->debug_log_bos)
      return;

   bo_log = calloc(1, sizeof(*bo_log));
   if (!bo_log)
      return;

   bo_log->va = bo->base.va;
   bo_log->size = bo->base.size;
   bo_log->timestamp = os_time_get_nano();
   bo_log->is_virtual = bo->is_virtual;
   bo_log->destroyed = destroyed;

   u_rwlock_wrlock(&ws->log_bo_list_lock);
   list_addtail(&bo_log->list, &ws->log_bo_list);
   u_rwlock_wrunlock(&ws->log_bo_list_lock);
}

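/* Add a BO to the winsys-wide BO list (grown geometrically, protected by a rwlock);
 * radv_amdgpu_global_bo_list_del below removes it again. The list holds BOs that must be resident
 * for every submission.
 */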
static VkResult
radv_amdgpu_global_bo_list_add(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo)
{
   u_rwlock_wrlock(&ws->global_bo_list.lock);
   if (ws->global_bo_list.count == ws->global_bo_list.capacity) {
      unsigned capacity = MAX2(4, ws->global_bo_list.capacity * 2);
      void *data = realloc(ws->global_bo_list.bos, capacity * sizeof(struct radv_amdgpu_winsys_bo *));
      if (!data) {
         u_rwlock_wrunlock(&ws->global_bo_list.lock);
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      }

      ws->global_bo_list.bos = (struct radv_amdgpu_winsys_bo **)data;
      ws->global_bo_list.capacity = capacity;
   }

   ws->global_bo_list.bos[ws->global_bo_list.count++] = bo;
   bo->base.use_global_list = true;
   u_rwlock_wrunlock(&ws->global_bo_list.lock);
   return VK_SUCCESS;
}

static void
radv_amdgpu_global_bo_list_del(struct radv_amdgpu_winsys *ws, struct radv_amdgpu_winsys_bo *bo)
{
   u_rwlock_wrlock(&ws->global_bo_list.lock);
   for (unsigned i = ws->global_bo_list.count; i-- > 0;) {
      if (ws->global_bo_list.bos[i] == bo) {
         ws->global_bo_list.bos[i] = ws->global_bo_list.bos[ws->global_bo_list.count - 1];
         --ws->global_bo_list.count;
         bo->base.use_global_list = false;
         break;
      }
   }
   u_rwlock_wrunlock(&ws->global_bo_list.lock);
}

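/* Destroy a winsys BO: clear the PRT region for virtual BOs, or unmap the VA, drop any CPU
 * mapping and free the kernel BO for real ones; then update the memory accounting and release
 * the VA range.
 */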
static void
radv_amdgpu_winsys_bo_destroy(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);

   radv_amdgpu_log_bo(ws, bo, true);

   if (bo->is_virtual) {
      int r;

      /* Clear mappings of this PRT VA region. */
      r = radv_amdgpu_bo_va_op(ws, 0, 0, bo->base.size, bo->base.va, 0, 0, AMDGPU_VA_OP_CLEAR);
      if (r) {
         fprintf(stderr, "radv/amdgpu: Failed to clear a PRT VA region (%d).\n", r);
      }

      free(bo->bos);
      free(bo->ranges);
      u_rwlock_destroy(&bo->lock);
   } else {
      if (bo->cpu_map)
         munmap(bo->cpu_map, bo->base.size);

      if (ws->debug_all_bos)
         radv_amdgpu_global_bo_list_del(ws, bo);
      radv_amdgpu_bo_va_op(ws, bo->bo_handle, 0, bo->base.size, bo->base.va, 0, 0, AMDGPU_VA_OP_UNMAP);
      ac_drm_bo_free(ws->dev, bo->bo);
   }

   if (bo->base.initial_domain & RADEON_DOMAIN_VRAM) {
      if (bo->base.vram_no_cpu_access) {
         p_atomic_add(&ws->allocated_vram, -align64(bo->base.size, ws->info.gart_page_size));
      } else {
         p_atomic_add(&ws->allocated_vram_vis, -align64(bo->base.size, ws->info.gart_page_size));
      }
   }

   if (bo->base.initial_domain & RADEON_DOMAIN_GTT)
      p_atomic_add(&ws->allocated_gtt, -align64(bo->base.size, ws->info.gart_page_size));

   ac_drm_va_range_free(bo->va_handle);
   FREE(bo);
}

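/* Allocate a new BO: reserve a VA range (optionally 32-bit and/or replayable at a fixed address),
 * then either reserve a PRT region for virtual BOs, or allocate a kernel BO with the requested
 * domains/flags, map it at the VA and update the VRAM/GTT accounting.
 */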
static VkResult
radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, uint64_t size, unsigned alignment,
                             enum radeon_bo_domain initial_domain, enum radeon_bo_flag flags, unsigned priority,
                             uint64_t replay_address, struct radeon_winsys_bo **out_bo)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo;
   struct amdgpu_bo_alloc_request request = {0};
   struct radv_amdgpu_map_range *ranges = NULL;
   ac_drm_bo buf_handle;
   uint64_t va = 0;
   amdgpu_va_handle va_handle;
   int r;
   VkResult result = VK_SUCCESS;

   /* Just be robust for callers that might use NULL-ness for determining if things should be freed.
    */
   *out_bo = NULL;

   bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
   if (!bo) {
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   unsigned virt_alignment = alignment;
   if (size >= ws->info.pte_fragment_size)
      virt_alignment = MAX2(virt_alignment, ws->info.pte_fragment_size);

   assert(!replay_address || (flags & RADEON_FLAG_REPLAYABLE));

   const uint64_t va_flags = AMDGPU_VA_RANGE_HIGH | (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
                             (flags & RADEON_FLAG_REPLAYABLE ? AMDGPU_VA_RANGE_REPLAYABLE : 0);
   r = ac_drm_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, size, virt_alignment, replay_address, &va,
                             &va_handle, va_flags);
   if (r) {
      result = replay_address ? VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS : VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error_va_alloc;
   }

   bo->base.va = va;
   bo->base.size = size;
   bo->va_handle = va_handle;
   bo->is_virtual = !!(flags & RADEON_FLAG_VIRTUAL);

   if (flags & RADEON_FLAG_VIRTUAL) {
      ranges = realloc(NULL, sizeof(struct radv_amdgpu_map_range));
      if (!ranges) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto error_ranges_alloc;
      }

      u_rwlock_init(&bo->lock);

      bo->ranges = ranges;
      bo->range_count = 1;
      bo->range_capacity = 1;

      bo->ranges[0].offset = 0;
      bo->ranges[0].size = size;
      bo->ranges[0].bo = NULL;
      bo->ranges[0].bo_offset = 0;

      /* Reserve a PRT VA region. */
      r = radv_amdgpu_bo_va_op(ws, 0, 0, size, bo->base.va, 0, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
      if (r) {
         fprintf(stderr, "radv/amdgpu: Failed to reserve a PRT VA region (%d).\n", r);
         result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
         goto error_ranges_alloc;
      }

      radv_amdgpu_log_bo(ws, bo, false);

      *out_bo = (struct radeon_winsys_bo *)bo;
      return VK_SUCCESS;
   }

   request.alloc_size = size;
   request.phys_alignment = alignment;

   if (initial_domain & RADEON_DOMAIN_VRAM) {
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;

      /* Since VRAM and GTT have almost the same performance on
       * APUs, we could just set GTT. However, in order to decrease
       * GTT(RAM) usage, which is shared with the OS, allow VRAM
       * placements too. The idea is not that VRAM is strictly
       * needed, but to use it so that it's not left unused and
       * wasted.
       *
       * Furthermore, even on discrete GPUs this is beneficial. If
       * both GTT and VRAM are set then AMDGPU still prefers VRAM
       * for the initial placement, but it makes the buffers
       * spillable. Otherwise AMDGPU tries very hard to place the
       * buffers in VRAM, to the extent that we get a lot of
       * unnecessary movement. This helps significantly when
       * e.g. Horizon Zero Dawn allocates more memory than we have
       * VRAM.
       */
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
   }

   if (initial_domain & RADEON_DOMAIN_GTT)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
   if (initial_domain & RADEON_DOMAIN_GDS)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
   if (initial_domain & RADEON_DOMAIN_OA)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;

   if (flags & RADEON_FLAG_CPU_ACCESS)
      request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
   if (flags & RADEON_FLAG_NO_CPU_ACCESS) {
      bo->base.vram_no_cpu_access = initial_domain & RADEON_DOMAIN_VRAM;
      request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
   }
   if (flags & RADEON_FLAG_GTT_WC)
      request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
   if (!(flags & RADEON_FLAG_IMPLICIT_SYNC))
      request.flags |= AMDGPU_GEM_CREATE_EXPLICIT_SYNC;
   if ((initial_domain & RADEON_DOMAIN_VRAM_GTT) && (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) &&
       ((ws->perftest & RADV_PERFTEST_LOCAL_BOS) || (flags & RADEON_FLAG_PREFER_LOCAL_BO))) {
      /* virtio needs to be able to create a dmabuf if CPU access is required but a
       * dmabuf cannot be created if VM_ALWAYS_VALID is used.
       */
      if (!ws->info.is_virtio || (request.flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)) {
         bo->base.is_local = true;
         request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
      }
   }
   /* Set AMDGPU_GEM_CREATE_VIRTIO_SHARED if the driver didn't disable buffer sharing. */
   if (ws->info.is_virtio && (initial_domain & RADEON_DOMAIN_VRAM_GTT) &&
       (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) == 0)
      request.flags |= AMDGPU_GEM_CREATE_VIRTIO_SHARED;
   if (initial_domain & RADEON_DOMAIN_VRAM) {
      if (ws->zero_all_vram_allocs || (flags & RADEON_FLAG_ZERO_VRAM))
         request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
   }

   if (flags & RADEON_FLAG_DISCARDABLE && ws->info.drm_minor >= 47)
      request.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;

   r = ac_drm_bo_alloc(ws->dev, &request, &buf_handle);
   if (r) {
      fprintf(stderr, "radv/amdgpu: Failed to allocate a buffer:\n");
      fprintf(stderr, "radv/amdgpu: size : %" PRIu64 " bytes\n", size);
      fprintf(stderr, "radv/amdgpu: alignment : %u bytes\n", alignment);
      fprintf(stderr, "radv/amdgpu: domains : %u\n", initial_domain);
      result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error_bo_alloc;
   }

   uint32_t kms_handle = 0;
   r = ac_drm_bo_export(ws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
   assert(!r);

   r = radv_amdgpu_bo_va_op(ws, kms_handle, 0, size, va, flags, 0, AMDGPU_VA_OP_MAP);
   if (r) {
      result = VK_ERROR_UNKNOWN;
      goto error_va_map;
   }

   bo->bo = buf_handle;
   bo->bo_handle = kms_handle;
   bo->base.initial_domain = initial_domain;
   bo->base.use_global_list = false;
   bo->priority = priority;
   bo->cpu_map = NULL;

   if (initial_domain & RADEON_DOMAIN_VRAM) {
      /* Buffers allocated in VRAM with the NO_CPU_ACCESS flag
       * aren't mappable and they are counted as part of the VRAM
       * counter.
       *
       * Otherwise, buffers with the CPU_ACCESS flag or with neither
       * flag (imported buffers) are counted as part of the VRAM
       * visible counter because they can be mapped.
       */
      if (bo->base.vram_no_cpu_access) {
         p_atomic_add(&ws->allocated_vram, align64(bo->base.size, ws->info.gart_page_size));
      } else {
         p_atomic_add(&ws->allocated_vram_vis, align64(bo->base.size, ws->info.gart_page_size));
      }
   }

   if (initial_domain & RADEON_DOMAIN_GTT)
      p_atomic_add(&ws->allocated_gtt, align64(bo->base.size, ws->info.gart_page_size));

   if (ws->debug_all_bos)
      radv_amdgpu_global_bo_list_add(ws, bo);
   radv_amdgpu_log_bo(ws, bo, false);

   *out_bo = (struct radeon_winsys_bo *)bo;
   return VK_SUCCESS;
error_va_map:
   ac_drm_bo_free(ws->dev, buf_handle);

error_bo_alloc:
   free(ranges);

error_ranges_alloc:
   ac_drm_va_range_free(va_handle);

error_va_alloc:
   FREE(bo);
   return result;
}

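/* Map a BO into the CPU address space. On virtio the mapping goes through ac_drm_bo_cpu_map();
 * otherwise the mmap offset is queried with DRM_AMDGPU_GEM_MMAP and mmap()ed, optionally at a
 * fixed address. The resulting pointer is cached in bo->cpu_map.
 */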
static void *
radv_amdgpu_winsys_bo_map(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, bool use_fixed_addr,
                          void *fixed_addr)
{
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);

   /* Safeguard for the Quantic Dream layer skipping unmaps. */
   if (bo->cpu_map && !use_fixed_addr)
      return bo->cpu_map;

   assert(!bo->cpu_map);

#if HAVE_AMDGPU_VIRTIO
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   if (ws->info.is_virtio) {
      /* We can't use DRM_AMDGPU_GEM_MMAP directly on virtio. Instead use bo_cpu_map since
       * the virtio version will map the buffer at the given address (if not NULL).
       */
      void *data = NULL;
      if (use_fixed_addr)
         data = fixed_addr;

      if (ac_drm_bo_cpu_map(ws->dev, bo->bo, &data))
         return NULL;
      return data;
   }
#endif

   union drm_amdgpu_gem_mmap args;
   memset(&args, 0, sizeof(args));
   args.in.handle = bo->bo_handle;

   int ret = drm_ioctl_write_read(radv_amdgpu_winsys(_ws)->fd, DRM_AMDGPU_GEM_MMAP, &args, sizeof(args));
   if (ret)
      return NULL;

   void *data = mmap(fixed_addr, bo->base.size, PROT_READ | PROT_WRITE, MAP_SHARED | (use_fixed_addr ? MAP_FIXED : 0),
                     radv_amdgpu_winsys(_ws)->fd, args.out.addr_ptr);
   if (data == MAP_FAILED)
      return NULL;

   bo->cpu_map = data;
   return data;
}

static void
radv_amdgpu_winsys_bo_unmap(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, bool replace)
{
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);

   /* Defense in depth against buggy apps. */
   if (!bo->cpu_map && !replace)
      return;

   assert(bo->cpu_map);
   if (replace) {
      (void)mmap(bo->cpu_map, bo->base.size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
   } else {
#if HAVE_AMDGPU_VIRTIO
      struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
      if (ws->info.is_virtio)
         ac_drm_bo_cpu_unmap(ws->dev, bo->bo);
      else
#endif
         munmap(bo->cpu_map, bo->base.size);
   }
   bo->cpu_map = NULL;
}

static uint64_t
radv_amdgpu_get_optimal_vm_alignment(struct radv_amdgpu_winsys *ws, uint64_t size, unsigned alignment)
{
   uint64_t vm_alignment = alignment;

   /* Increase the VM alignment for faster address translation. */
   if (size >= ws->info.pte_fragment_size)
      vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size);

   /* Gfx9: Increase the VM alignment to the most significant bit set
    * in the size for faster address translation.
    */
   if (ws->info.gfx_level >= GFX9) {
      unsigned msb = util_last_bit64(size); /* 0 = no bit is set */
      uint64_t msb_alignment = msb ? 1ull << (msb - 1) : 0;

      vm_alignment = MAX2(vm_alignment, msb_alignment);
   }
   return vm_alignment;
}

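/* Import a user pointer as a GTT BO: create a userptr BO, allocate a VA range with the optimal
 * alignment, map it and account the size as GTT.
 */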
static VkResult
radv_amdgpu_winsys_bo_from_ptr(struct radeon_winsys *_ws, void *pointer, uint64_t size, unsigned priority,
                               struct radeon_winsys_bo **out_bo)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   ac_drm_bo buf_handle;
   struct radv_amdgpu_winsys_bo *bo;
   uint64_t va;
   amdgpu_va_handle va_handle;
   uint64_t vm_alignment;
   VkResult result = VK_SUCCESS;
   int ret;

   /* Just be robust for callers that might use NULL-ness for determining if things should be freed.
    */
   *out_bo = NULL;

   bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
   if (!bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   ret = ac_drm_create_bo_from_user_mem(ws->dev, pointer, size, &buf_handle);
   if (ret) {
      if (ret == -EINVAL) {
         result = VK_ERROR_INVALID_EXTERNAL_HANDLE;
      } else {
         result = VK_ERROR_UNKNOWN;
      }
      goto error;
   }

   /* Using the optimal VM alignment also fixes GPU hangs for buffers that
    * are imported.
    */
   vm_alignment = radv_amdgpu_get_optimal_vm_alignment(ws, size, ws->info.gart_page_size);

   if (ac_drm_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, size, vm_alignment, 0, &va, &va_handle,
                             AMDGPU_VA_RANGE_HIGH)) {
      result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error_va_alloc;
   }

   uint32_t kms_handle = 0;
   ASSERTED int r = ac_drm_bo_export(ws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
   assert(!r);

   if (ac_drm_bo_va_op(ws->dev, kms_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP)) {
      result = VK_ERROR_UNKNOWN;
      goto error_va_map;
   }

   /* Initialize it */
   bo->base.va = va;
   bo->va_handle = va_handle;
   bo->base.size = size;
   bo->bo = buf_handle;
   bo->bo_handle = kms_handle;
   bo->base.initial_domain = RADEON_DOMAIN_GTT;
   bo->base.use_global_list = false;
   bo->priority = priority;
   bo->cpu_map = NULL;

   p_atomic_add(&ws->allocated_gtt, align64(bo->base.size, ws->info.gart_page_size));

   if (ws->debug_all_bos)
      radv_amdgpu_global_bo_list_add(ws, bo);
   radv_amdgpu_log_bo(ws, bo, false);

   *out_bo = (struct radeon_winsys_bo *)bo;
   return VK_SUCCESS;

error_va_map:
   ac_drm_va_range_free(va_handle);

error_va_alloc:
   ac_drm_bo_free(ws->dev, buf_handle);

error:
   FREE(bo);
   return result;
}

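/* Import a dma-buf fd: import the kernel BO, query its size and preferred heaps, allocate and map
 * a VA range, and update the VRAM/GTT accounting accordingly.
 */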
static VkResult
radv_amdgpu_winsys_bo_from_fd(struct radeon_winsys *_ws, int fd, unsigned priority, struct radeon_winsys_bo **out_bo,
                              uint64_t *alloc_size)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo;
   uint64_t va;
   amdgpu_va_handle va_handle;
   enum amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd;
   struct ac_drm_bo_import_result result;
   struct amdgpu_bo_info info;
   enum radeon_bo_domain initial = 0;
   int r;
   VkResult vk_result = VK_SUCCESS;

   /* Just be robust for callers that might use NULL-ness for determining if things should be freed.
    */
   *out_bo = NULL;

   bo = CALLOC_STRUCT(radv_amdgpu_winsys_bo);
   if (!bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   r = ac_drm_bo_import(ws->dev, type, fd, &result);
   if (r) {
      vk_result = VK_ERROR_INVALID_EXTERNAL_HANDLE;
      goto error;
   }

   uint32_t kms_handle = 0;
   r = ac_drm_bo_export(ws->dev, result.bo, amdgpu_bo_handle_type_kms, &kms_handle);
   assert(!r);

   r = ac_drm_bo_query_info(ws->dev, kms_handle, &info);
   if (r) {
      vk_result = VK_ERROR_UNKNOWN;
      goto error_query;
   }

   if (alloc_size) {
      *alloc_size = info.alloc_size;
   }

   r = ac_drm_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, result.alloc_size, 1 << 20, 0, &va, &va_handle,
                             AMDGPU_VA_RANGE_HIGH);
   if (r) {
      vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      goto error_query;
   }

   r = radv_amdgpu_bo_va_op(ws, kms_handle, 0, result.alloc_size, va, 0, 0, AMDGPU_VA_OP_MAP);
   if (r) {
      vk_result = VK_ERROR_UNKNOWN;
      goto error_va_map;
   }

   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
      initial |= RADEON_DOMAIN_VRAM;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
      initial |= RADEON_DOMAIN_GTT;

   bo->bo = result.bo;
   bo->bo_handle = kms_handle;
   bo->base.va = va;
   bo->va_handle = va_handle;
   bo->base.initial_domain = initial;
   bo->base.use_global_list = false;
   bo->base.size = result.alloc_size;
   bo->priority = priority;
   bo->cpu_map = NULL;

   if (bo->base.initial_domain & RADEON_DOMAIN_VRAM)
      p_atomic_add(&ws->allocated_vram, align64(bo->base.size, ws->info.gart_page_size));
   if (bo->base.initial_domain & RADEON_DOMAIN_GTT)
      p_atomic_add(&ws->allocated_gtt, align64(bo->base.size, ws->info.gart_page_size));

   if (ws->debug_all_bos)
      radv_amdgpu_global_bo_list_add(ws, bo);
   radv_amdgpu_log_bo(ws, bo, false);

   *out_bo = (struct radeon_winsys_bo *)bo;
   return VK_SUCCESS;
error_va_map:
   ac_drm_va_range_free(va_handle);

error_query:
   ac_drm_bo_free(ws->dev, result.bo);

error:
   FREE(bo);
   return vk_result;
}

static bool
radv_amdgpu_winsys_get_fd(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, int *fd)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   enum amdgpu_bo_handle_type type = amdgpu_bo_handle_type_dma_buf_fd;
   int r;
   unsigned handle;
   r = ac_drm_bo_export(ws->dev, bo->bo, type, &handle);
   if (r)
      return false;

   *fd = (int)handle;
   return true;
}

static bool
radv_amdgpu_bo_get_flags_from_fd(struct radeon_winsys *_ws, int fd, enum radeon_bo_domain *domains,
                                 enum radeon_bo_flag *flags)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct ac_drm_bo_import_result result = {0};
   struct amdgpu_bo_info info = {0};
   int r;

   *domains = 0;
   *flags = 0;

   r = ac_drm_bo_import(ws->dev, amdgpu_bo_handle_type_dma_buf_fd, fd, &result);
   if (r)
      return false;

   uint32_t kms_handle = 0;
   r = ac_drm_bo_export(ws->dev, result.bo, amdgpu_bo_handle_type_kms, &kms_handle);
   assert(!r);

   r = ac_drm_bo_query_info(ws->dev, kms_handle, &info);
   ac_drm_bo_free(ws->dev, result.bo);
   if (r)
      return false;

   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
      *domains |= RADEON_DOMAIN_VRAM;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
      *domains |= RADEON_DOMAIN_GTT;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GDS)
      *domains |= RADEON_DOMAIN_GDS;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_OA)
      *domains |= RADEON_DOMAIN_OA;

   if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
      *flags |= RADEON_FLAG_CPU_ACCESS;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)
      *flags |= RADEON_FLAG_NO_CPU_ACCESS;
   if (!(info.alloc_flags & AMDGPU_GEM_CREATE_EXPLICIT_SYNC))
      *flags |= RADEON_FLAG_IMPLICIT_SYNC;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
      *flags |= RADEON_FLAG_GTT_WC;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
      *flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_PREFER_LOCAL_BO;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_VRAM_CLEARED)
      *flags |= RADEON_FLAG_ZERO_VRAM;
   return true;
}

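/* Decode the legacy TILE_SPLIT hardware encoding into bytes (64..4096); radv_eg_tile_split_rev
 * below is the inverse.
 */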
static unsigned
eg_tile_split(unsigned tile_split)
{
   switch (tile_split) {
   case 0:
      tile_split = 64;
      break;
   case 1:
      tile_split = 128;
      break;
   case 2:
      tile_split = 256;
      break;
   case 3:
      tile_split = 512;
      break;
   default:
   case 4:
      tile_split = 1024;
      break;
   case 5:
      tile_split = 2048;
      break;
   case 6:
      tile_split = 4096;
      break;
   }
   return tile_split;
}

static unsigned
radv_eg_tile_split_rev(unsigned eg_tile_split)
{
   switch (eg_tile_split) {
   case 64:
      return 0;
   case 128:
      return 1;
   case 256:
      return 2;
   case 512:
      return 3;
   default:
   case 1024:
      return 4;
   case 2048:
      return 5;
   case 4096:
      return 6;
   }
}

#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_SHIFT 45
#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_MASK 0x3

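/* Pack the radeon_bo_metadata into AMDGPU tiling flags (GFX9+ swizzle/DCC fields, or the legacy
 * array-mode/bank description) and store it on the kernel BO, so that importers of the BO can
 * retrieve the same layout via the get_metadata path below.
 */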
static void
radv_amdgpu_winsys_bo_set_metadata(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo,
                                   struct radeon_bo_metadata *md)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   struct amdgpu_bo_metadata metadata = {0};
   uint64_t tiling_flags = 0;

   if (ws->info.gfx_level >= GFX9) {
      tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
      tiling_flags |= AMDGPU_TILING_SET(DCC_OFFSET_256B, md->u.gfx9.dcc_offset_256b);
      tiling_flags |= AMDGPU_TILING_SET(DCC_PITCH_MAX, md->u.gfx9.dcc_pitch_max);
      tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, md->u.gfx9.dcc_independent_64b_blocks);
      tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_128B, md->u.gfx9.dcc_independent_128b_blocks);
      tiling_flags |= AMDGPU_TILING_SET(DCC_MAX_COMPRESSED_BLOCK_SIZE, md->u.gfx9.dcc_max_compressed_block_size);
      tiling_flags |= AMDGPU_TILING_SET(SCANOUT, md->u.gfx9.scanout);
   } else {
      if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
      else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
      else
         tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */

      tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
      tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
      tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
      if (md->u.legacy.tile_split)
         tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, radv_eg_tile_split_rev(md->u.legacy.tile_split));
      tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
      tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks) - 1);

      if (md->u.legacy.scanout)
         tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
      else
         tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
   }

   metadata.tiling_info = tiling_flags;
   metadata.size_metadata = md->size_metadata;
   memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));

   ac_drm_bo_set_metadata(ws->dev, bo->bo_handle, &metadata);
}

static void
radv_amdgpu_winsys_bo_get_metadata(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo,
                                   struct radeon_bo_metadata *md)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   struct amdgpu_bo_info info = {0};

   int r = ac_drm_bo_query_info(ws->dev, bo->bo_handle, &info);
   if (r)
      return;

   uint64_t tiling_flags = info.metadata.tiling_info;

   if (ws->info.gfx_level >= GFX9) {
      md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
      md->u.gfx9.scanout = AMDGPU_TILING_GET(tiling_flags, SCANOUT);
   } else {
      md->u.legacy.microtile = RADEON_LAYOUT_LINEAR;
      md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR;

      if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */
         md->u.legacy.macrotile = RADEON_LAYOUT_TILED;
      else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
         md->u.legacy.microtile = RADEON_LAYOUT_TILED;

      md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
      md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
      md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
      md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
      md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
      md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
      md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
   }

   md->size_metadata = info.metadata.size_metadata;
   memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
}

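/* Track residency for non-local BOs by adding/removing them from the global BO list; local BOs
 * are always resident via the kernel VM, and with debug_all_bos every BO is already on the list.
 */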
static VkResult
radv_amdgpu_winsys_bo_make_resident(struct radeon_winsys *_ws, struct radeon_winsys_bo *_bo, bool resident)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
   VkResult result = VK_SUCCESS;

   /* Do not add the BO to the global list if it's a local BO because the
    * kernel maintains a list for us.
    */
   if (bo->base.is_local)
      return VK_SUCCESS;

   /* Do not add the BO twice to the global list if the allbos debug
    * option is enabled.
    */
   if (ws->debug_all_bos)
      return VK_SUCCESS;

   if (resident) {
      result = radv_amdgpu_global_bo_list_add(ws, bo);
   } else {
      radv_amdgpu_global_bo_list_del(ws, bo);
   }

   return result;
}

static int
radv_amdgpu_bo_va_compare(const void *a, const void *b)
{
   const struct radv_amdgpu_winsys_bo *bo_a = *(const struct radv_amdgpu_winsys_bo *const *)a;
   const struct radv_amdgpu_winsys_bo *bo_b = *(const struct radv_amdgpu_winsys_bo *const *)b;
   return bo_a->base.va < bo_b->base.va ? -1 : bo_a->base.va > bo_b->base.va ? 1 : 0;
}

static uint64_t
radv_amdgpu_canonicalize_va(uint64_t va)
{
   /* It would be less hardcoded to derive a mask from addr32_hi (0xffff8000), but there are
    * confusing differences between the kernel's page fault reports, which seem to report the top
    * 48 bits, and addr32_hi, which only covers 47 bits. */
   return va & ((1ull << 48) - 1);
}

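/* Dump the BO history (creations, destructions and virtual mappings) gathered while BO logging is
 * enabled; VAs are canonicalized to their low 48 bits to match kernel page fault reports.
 */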
static void
radv_amdgpu_dump_bo_log(struct radeon_winsys *_ws, FILE *file)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo_log *bo_log;

   if (!ws->debug_log_bos)
      return;

   u_rwlock_rdlock(&ws->log_bo_list_lock);
   LIST_FOR_EACH_ENTRY (bo_log, &ws->log_bo_list, list) {
      if (bo_log->virtual_mapping) {
         fprintf(file, "timestamp=%llu, VA=%.16llx-%.16llx, mapped_to=%.16llx\n",
                 (long long)bo_log->timestamp,
                 (long long)radv_amdgpu_canonicalize_va(bo_log->va),
                 (long long)radv_amdgpu_canonicalize_va(bo_log->va + bo_log->size),
                 (long long)radv_amdgpu_canonicalize_va(bo_log->mapped_va));
      } else {
         fprintf(file, "timestamp=%llu, VA=%.16llx-%.16llx, destroyed=%d, is_virtual=%d\n",
                 (long long)bo_log->timestamp,
                 (long long)radv_amdgpu_canonicalize_va(bo_log->va),
                 (long long)radv_amdgpu_canonicalize_va(bo_log->va + bo_log->size), bo_log->destroyed,
                 bo_log->is_virtual);
      }
   }
   u_rwlock_rdunlock(&ws->log_bo_list_lock);
}

static void
radv_amdgpu_dump_bo_ranges(struct radeon_winsys *_ws, FILE *file)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   if (ws->debug_all_bos) {
      struct radv_amdgpu_winsys_bo **bos = NULL;
      int i = 0;

      u_rwlock_rdlock(&ws->global_bo_list.lock);
      bos = malloc(sizeof(*bos) * ws->global_bo_list.count);
      if (!bos) {
         u_rwlock_rdunlock(&ws->global_bo_list.lock);
         fprintf(file, " Failed to allocate memory to sort VA ranges for dumping\n");
         return;
      }

      for (i = 0; i < ws->global_bo_list.count; i++) {
         bos[i] = ws->global_bo_list.bos[i];
      }
      qsort(bos, ws->global_bo_list.count, sizeof(bos[0]), radv_amdgpu_bo_va_compare);

      for (i = 0; i < ws->global_bo_list.count; ++i) {
         fprintf(file, " VA=%.16llx-%.16llx, handle=%d\n", (long long)radv_amdgpu_canonicalize_va(bos[i]->base.va),
                 (long long)radv_amdgpu_canonicalize_va(bos[i]->base.va + bos[i]->base.size), bos[i]->bo_handle);
      }
      free(bos);
      u_rwlock_rdunlock(&ws->global_bo_list.lock);
   } else
      fprintf(file, " To get BO VA ranges, please specify RADV_DEBUG=allbos\n");
}

void
radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws)
{
   ws->base.buffer_create = radv_amdgpu_winsys_bo_create;
   ws->base.buffer_destroy = radv_amdgpu_winsys_bo_destroy;
   ws->base.buffer_map = radv_amdgpu_winsys_bo_map;
   ws->base.buffer_unmap = radv_amdgpu_winsys_bo_unmap;
   ws->base.buffer_from_ptr = radv_amdgpu_winsys_bo_from_ptr;
   ws->base.buffer_from_fd = radv_amdgpu_winsys_bo_from_fd;
   ws->base.buffer_get_fd = radv_amdgpu_winsys_get_fd;
   ws->base.buffer_set_metadata = radv_amdgpu_winsys_bo_set_metadata;
   ws->base.buffer_get_metadata = radv_amdgpu_winsys_bo_get_metadata;
   ws->base.buffer_virtual_bind = radv_amdgpu_winsys_bo_virtual_bind;
   ws->base.buffer_get_flags_from_fd = radv_amdgpu_bo_get_flags_from_fd;
   ws->base.buffer_make_resident = radv_amdgpu_winsys_bo_make_resident;
   ws->base.dump_bo_ranges = radv_amdgpu_dump_bo_ranges;
   ws->base.dump_bo_log = radv_amdgpu_dump_bo_log;
}