1 /*
2 * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
3 * Copyright © 2015 Advanced Micro Devices, Inc.
4 *
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include <sys/ioctl.h>
9
10 #include "amdgpu_cs.h"
11
12 #include "util/os_drm.h"
13 #include "util/hash_table.h"
14 #include "util/os_time.h"
15 #include "util/u_hash_table.h"
16 #include "util/u_process.h"
17 #include "frontend/drm_driver.h"
18 #include "drm-uapi/amdgpu_drm.h"
19 #include "drm-uapi/dma-buf.h"
20 #include "sid.h"
21 #include <xf86drm.h>
22 #include <stdio.h>
23 #include <inttypes.h>
24
25 #ifndef AMDGPU_VA_RANGE_HIGH
26 #define AMDGPU_VA_RANGE_HIGH 0x2
27 #endif
28
29 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
30 #define DEBUG_SPARSE_COMMITS 0
31
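/* A free [begin, end) page range inside a sparse backing buffer; chunks are kept sorted and
 * coalesced as pages are allocated and freed.
 */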
32 struct amdgpu_sparse_backing_chunk {
33 uint32_t begin, end;
34 };
35
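/* Wait for one buffer fence. Called with aws->bo_fence_lock held: returns true (idle) with the
 * lock still held, or false (busy) with the lock already released.
 */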
36 static bool amdgpu_bo_fence_wait(struct amdgpu_winsys *aws,
37 struct pipe_fence_handle **fence,
38 uint64_t timeout, int64_t abs_timeout)
39 {
40 if (timeout == 0) {
41 bool idle = amdgpu_fence_wait(*fence, 0, false);
42
43 if (!idle) {
44 simple_mtx_unlock(&aws->bo_fence_lock);
45 return false; /* busy */
46 }
47
48 /* It's idle. Remove it from the ring to skip checking it again later. */
49 amdgpu_fence_reference(fence, NULL);
50 } else {
51 struct pipe_fence_handle *tmp_fence = NULL;
52 amdgpu_fence_reference(&tmp_fence, *fence);
53
54 /* While waiting, unlock the mutex. */
55 simple_mtx_unlock(&aws->bo_fence_lock);
56
57 bool idle = amdgpu_fence_wait(tmp_fence, abs_timeout, true);
58 if (!idle) {
59 amdgpu_fence_reference(&tmp_fence, NULL);
60 return false; /* busy */
61 }
62
63 simple_mtx_lock(&aws->bo_fence_lock);
64 /* It's idle. Remove it from the ring to skip checking it again later. */
65 if (tmp_fence == *fence)
66 amdgpu_fence_reference(fence, NULL);
67 amdgpu_fence_reference(&tmp_fence, NULL);
68 }
69
70 return true;
71 }
72
73 static bool amdgpu_bo_wait(struct radeon_winsys *rws,
74 struct pb_buffer_lean *_buf, uint64_t timeout,
75 unsigned usage)
76 {
77 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
78 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
79 int64_t abs_timeout = 0;
80
81 assert(p_atomic_read(&bo->num_active_ioctls) >= 0);
82
83 if (timeout == 0) {
84 if (p_atomic_read(&bo->num_active_ioctls))
85 return false;
86
87 } else {
88 abs_timeout = os_time_get_absolute_timeout(timeout);
89
90 /* Wait if any ioctl is being submitted with this buffer. */
91 if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
92 return false;
93 }
94
95 if (is_real_bo(bo) && (get_real_bo(bo)->is_shared || get_real_bo(bo)->slab_has_busy_alt_fences)) {
96 /* We can't use user fences for shared buffers, because user fences are local to this
97 * process only. If we want to wait for all buffer uses in all processes, we have to
98 * use amdgpu_bo_wait_for_idle.
99 *
100 * Additionally, if this is a slab buffer and one of the slab entries has non-NULL
101 * alt_fence, we can't easily wait for that here. Instead, use the kernel ioctl to wait
102 * for the buffer.
103 */
104 bool buffer_busy = true;
105 int r;
106
107 /* The GEM_WAIT_IDLE ioctl with timeout=0 can take up to 1 ms to return. This is a kernel
108 * inefficiency. This flag indicates whether it's better to return busy than wait for 1 ms.
109 */
110 if (timeout == 0 && usage & RADEON_USAGE_DISALLOW_SLOW_REPLY)
111 return false;
112
113 r = ac_drm_bo_wait_for_idle(aws->dev, get_real_bo(bo)->bo, timeout, &buffer_busy);
114 if (r)
115 fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__, r);
116
117 if (!buffer_busy)
118 get_real_bo(bo)->slab_has_busy_alt_fences = false;
119 return !buffer_busy;
120 }
121
122 simple_mtx_lock(&aws->bo_fence_lock);
123
124 u_foreach_bit(i, bo->fences.valid_fence_mask) {
125 struct pipe_fence_handle **fence = get_fence_from_ring(aws, &bo->fences, i);
126
127 if (fence) {
128 /* This also unlocks the mutex on failure. */
129 if (!amdgpu_bo_fence_wait(aws, fence, timeout, abs_timeout))
130 return false;
131 }
132
133 bo->fences.valid_fence_mask &= ~BITFIELD_BIT(i); /* remove the fence from the BO */
134 }
135
136 /* Also wait for alt_fence. */
137 if (bo->alt_fence) {
138 /* This also unlocks the mutex on failure. */
139 if (!amdgpu_bo_fence_wait(aws, &bo->alt_fence, timeout, abs_timeout))
140 return false;
141 }
142
143 simple_mtx_unlock(&aws->bo_fence_lock);
144 return true; /* idle */
145 }
146
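/* Gather the syncobj handles of all still-busy fences of this buffer into 'syncobj' (used as
 * input fences for user-queue VM ioctls); fences that have already signaled are dropped from
 * the BO instead.
 */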
147 static void amdgpu_bo_get_syncobjs(struct amdgpu_winsys *aws, struct amdgpu_winsys_bo *bo,
148 uint32_t *syncobj, uint32_t *num_fences)
149 {
150 if (p_atomic_read(&bo->num_active_ioctls))
151 os_wait_until_zero(&bo->num_active_ioctls, OS_TIMEOUT_INFINITE);
152
153 simple_mtx_lock(&aws->bo_fence_lock);
154 u_foreach_bit(queue_index, bo->fences.valid_fence_mask) {
155 struct pipe_fence_handle **fence = get_fence_from_ring(aws, &bo->fences, queue_index);
156 if (fence) {
157 if (!amdgpu_fence_wait(*fence, 0, 0)) {
158 syncobj[(*num_fences)++] = ((struct amdgpu_fence*)*fence)->syncobj;
159 } else {
160 amdgpu_fence_reference(fence, NULL);
161 /* remove the fence from the BO */
162 bo->fences.valid_fence_mask &= ~BITFIELD_BIT(queue_index);
163 }
164 }
165 }
166
167 if (bo->alt_fence) {
168 if (!amdgpu_fence_wait(bo->alt_fence, 0, 0))
169 syncobj[(*num_fences)++] = ((struct amdgpu_fence*)bo->alt_fence)->syncobj;
170 else
171 amdgpu_fence_reference(&bo->alt_fence, NULL);
172 }
173 simple_mtx_unlock(&aws->bo_fence_lock);
174 }
175
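/* Common VA map/unmap helper. With user-mode queues, the ioctl takes the buffer's busy fences as
 * input syncobjs and signals aws->vm_timeline_syncobj at a new timeline point; vm_ioctl_lock keeps
 * the ioctl submission order consistent with the timeline point order. Without user queues, the
 * plain raw VA op is submitted.
 */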
176 static int amdgpu_bo_va_op_common(struct amdgpu_winsys *aws, struct amdgpu_winsys_bo *bo,
177 uint32_t bo_handle, bool send_input_fence,
178 uint64_t *vm_timeline_point, uint64_t offset, uint64_t size,
179 uint64_t addr, uint64_t flags, uint32_t ops)
180 {
181 int r;
182
183 if (aws->info.use_userq) {
184 uint32_t syncobj_arr[AMDGPU_MAX_QUEUES + 1];
185 uint32_t num_fences = 0;
186
187 if (send_input_fence)
188 amdgpu_bo_get_syncobjs(aws, bo, &syncobj_arr[0], &num_fences);
189
190 /* The lock guarantees that the execution ordering of the vm ioctls matches the timeline
191 * sequence number ordering.
192 */
193 simple_mtx_lock(&aws->vm_ioctl_lock);
194 aws->vm_timeline_seq_num++;
195 if (vm_timeline_point) {
196 /* Sparse buffers can be updated concurrently by another thread, so we use an atomic operation
197 * to get a valid seqno.
198 */
199 p_atomic_set(vm_timeline_point, aws->vm_timeline_seq_num);
200 }
201 r = ac_drm_bo_va_op_raw2(aws->dev, bo_handle, offset, size, addr, flags, ops,
202 aws->vm_timeline_syncobj, aws->vm_timeline_seq_num,
203 (uintptr_t)&syncobj_arr, num_fences);
204 simple_mtx_unlock(&aws->vm_ioctl_lock);
205 } else {
206 r = ac_drm_bo_va_op_raw(aws->dev, bo_handle, offset, size, addr, flags, ops);
207 }
208
209 return r;
210 }
211
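/* Return the byte offset of a slab entry within its backing slab buffer (entry index * entry size). */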
212 static inline unsigned get_slab_entry_offset(struct amdgpu_winsys_bo *bo)
213 {
214 struct amdgpu_bo_slab_entry *slab_entry_bo = get_slab_entry_bo(bo);
215 struct amdgpu_bo_real_reusable_slab *slab_bo =
216 (struct amdgpu_bo_real_reusable_slab *)get_slab_entry_real_bo(bo);
217 unsigned entry_index = slab_entry_bo - slab_bo->entries;
218
219 return slab_bo->slab.entry_size * entry_index;
220 }
221
222 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
223 struct pb_buffer_lean *buf)
224 {
225 return ((struct amdgpu_winsys_bo*)buf)->base.placement;
226 }
227
228 static enum radeon_bo_flag amdgpu_bo_get_flags(
229 struct pb_buffer_lean *buf)
230 {
231 return ((struct amdgpu_winsys_bo*)buf)->base.usage;
232 }
233
234 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
235 {
236 bo->fences.valid_fence_mask = 0;
237 amdgpu_fence_reference(&bo->alt_fence, NULL);
238 }
239
240 void amdgpu_bo_destroy(struct amdgpu_winsys *aws, struct pb_buffer_lean *_buf)
241 {
242 struct amdgpu_bo_real *bo = get_real_bo(amdgpu_winsys_bo(_buf));
243 struct amdgpu_screen_winsys *sws_iter;
244
245 simple_mtx_lock(&aws->bo_export_table_lock);
246
247 /* amdgpu_bo_from_handle might have revived the bo */
248 if (p_atomic_read(&bo->b.base.reference.count)) {
249 simple_mtx_unlock(&aws->bo_export_table_lock);
250 return;
251 }
252
253 _mesa_hash_table_remove_key(aws->bo_export_table, bo->bo.abo);
254
255 if (bo->b.base.placement & RADEON_DOMAIN_VRAM_GTT) {
256 amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(_buf), bo->kms_handle, true, NULL, 0,
257 bo->b.base.size, amdgpu_va_get_start_addr(bo->va_handle),
258 AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
259 AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_UNMAP);
260 ac_drm_va_range_free(bo->va_handle);
261 }
262
263 simple_mtx_unlock(&aws->bo_export_table_lock);
264
265 if (!bo->is_user_ptr && bo->cpu_ptr) {
266 bo->cpu_ptr = NULL;
267 amdgpu_bo_unmap(&aws->dummy_sws.base, &bo->b.base);
268 }
269 assert(bo->is_user_ptr || bo->map_count == 0);
270
271 ac_drm_bo_free(aws->dev, bo->bo);
272
273 #if MESA_DEBUG
274 if (aws->debug_all_bos) {
275 simple_mtx_lock(&aws->global_bo_list_lock);
276 list_del(&bo->global_list_item);
277 aws->num_buffers--;
278 simple_mtx_unlock(&aws->global_bo_list_lock);
279 }
280 #endif
281
282 /* Close all KMS handles retrieved for other DRM file descriptions */
283 simple_mtx_lock(&aws->sws_list_lock);
284 for (sws_iter = aws->sws_list; sws_iter; sws_iter = sws_iter->next) {
285 struct hash_entry *entry;
286
287 if (!sws_iter->kms_handles)
288 continue;
289
290 entry = _mesa_hash_table_search(sws_iter->kms_handles, bo);
291 if (entry) {
292 struct drm_gem_close args = { .handle = (uintptr_t)entry->data };
293
294 drm_ioctl(sws_iter->fd, DRM_IOCTL_GEM_CLOSE, &args);
295 _mesa_hash_table_remove(sws_iter->kms_handles, entry);
296 }
297 }
298 simple_mtx_unlock(&aws->sws_list_lock);
299
300 amdgpu_bo_remove_fences(&bo->b);
301
302 if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
303 aws->allocated_vram -= align64(bo->b.base.size, aws->info.gart_page_size);
304 else if (bo->b.base.placement & RADEON_DOMAIN_GTT)
305 aws->allocated_gtt -= align64(bo->b.base.size, aws->info.gart_page_size);
306
307 simple_mtx_destroy(&bo->map_lock);
308 FREE(bo);
309 }
310
311 static void amdgpu_bo_destroy_or_cache(struct radeon_winsys *rws, struct pb_buffer_lean *_buf)
312 {
313 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
314 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
315
316 assert(is_real_bo(bo)); /* slab buffers have a separate vtbl */
317
318 if (bo->type >= AMDGPU_BO_REAL_REUSABLE)
319 pb_cache_add_buffer(&aws->bo_cache, &((struct amdgpu_bo_real_reusable*)bo)->cache_entry);
320 else
321 amdgpu_bo_destroy(aws, _buf);
322 }
323
324 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *aws)
325 {
326 pb_slabs_reclaim(&aws->bo_slabs);
327 pb_cache_release_all_buffers(&aws->bo_cache);
328 }
329
330 static bool amdgpu_bo_do_map(struct radeon_winsys *rws, struct amdgpu_bo_real *bo, void **cpu)
331 {
332 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
333
334 assert(!bo->is_user_ptr);
335
336 *cpu = NULL;
337 int r = ac_drm_bo_cpu_map(aws->dev, bo->bo, cpu);
338
339 if (r) {
340 /* Clean up buffer managers and try again. */
341 amdgpu_clean_up_buffer_managers(aws);
342 r = ac_drm_bo_cpu_map(aws->dev, bo->bo, cpu);
343 if (r)
344 return false;
345 }
346
347 if (p_atomic_inc_return(&bo->map_count) == 1) {
348 if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
349 aws->mapped_vram += bo->b.base.size;
350 else if (bo->b.base.placement & RADEON_DOMAIN_GTT)
351 aws->mapped_gtt += bo->b.base.size;
352 aws->num_mapped_buffers++;
353 }
354
355 return true;
356 }
357
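/* Map a buffer for CPU access. Persistent (non-RADEON_MAP_TEMPORARY) mappings are cached in
 * real->cpu_ptr so repeated maps are cheap; RADEON_MAP_TEMPORARY mappings must each be balanced
 * by amdgpu_bo_unmap.
 */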
358 void *amdgpu_bo_map(struct radeon_winsys *rws,
359 struct pb_buffer_lean *buf,
360 struct radeon_cmdbuf *rcs,
361 enum pipe_map_flags usage)
362 {
363 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
364 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
365 struct amdgpu_bo_real *real;
366 struct amdgpu_cs *cs = rcs ? amdgpu_cs(rcs) : NULL;
367
368 assert(bo->type != AMDGPU_BO_SPARSE);
369
370 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
371 if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
372 /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
373 if (usage & PIPE_MAP_DONTBLOCK) {
374 if (!(usage & PIPE_MAP_WRITE)) {
375 /* Mapping for read.
376 *
377 * Since we are mapping for read, we don't need to wait
378 * if the GPU is using the buffer for read too
379 * (neither one is changing it).
380 *
381 * Only check whether the buffer is being used for write. */
382 if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
383 RADEON_USAGE_WRITE)) {
384 cs->flush_cs(cs->flush_data,
385 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
386 return NULL;
387 }
388
389 if (!amdgpu_bo_wait(rws, (struct pb_buffer_lean*)bo, 0,
390 RADEON_USAGE_WRITE)) {
391 return NULL;
392 }
393 } else {
394 if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
395 cs->flush_cs(cs->flush_data,
396 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
397 return NULL;
398 }
399
400 if (!amdgpu_bo_wait(rws, (struct pb_buffer_lean*)bo, 0,
401 RADEON_USAGE_READWRITE)) {
402 return NULL;
403 }
404 }
405 } else {
406 uint64_t time = os_time_get_nano();
407
408 if (!(usage & PIPE_MAP_WRITE)) {
409 /* Mapping for read.
410 *
411 * Since we are mapping for read, we don't need to wait
412 * if the GPU is using the buffer for read too
413 * (neither one is changing it).
414 *
415 * Only check whether the buffer is being used for write. */
416 if (cs) {
417 if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
418 RADEON_USAGE_WRITE)) {
419 cs->flush_cs(cs->flush_data,
420 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
421 } else {
422 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
423 if (p_atomic_read(&bo->num_active_ioctls))
424 amdgpu_cs_sync_flush(rcs);
425 }
426 }
427
428 amdgpu_bo_wait(rws, (struct pb_buffer_lean*)bo, OS_TIMEOUT_INFINITE,
429 RADEON_USAGE_WRITE);
430 } else {
431 /* Mapping for write. */
432 if (cs) {
433 if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
434 cs->flush_cs(cs->flush_data,
435 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
436 } else {
437 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
438 if (p_atomic_read(&bo->num_active_ioctls))
439 amdgpu_cs_sync_flush(rcs);
440 }
441 }
442
443 amdgpu_bo_wait(rws, (struct pb_buffer_lean*)bo, OS_TIMEOUT_INFINITE,
444 RADEON_USAGE_READWRITE);
445 }
446
447 aws->buffer_wait_time += os_time_get_nano() - time;
448 }
449 }
450
451 /* Buffer synchronization has been checked, now actually map the buffer. */
452 void *cpu = NULL;
453 uint64_t offset = 0;
454
455 if (is_real_bo(bo)) {
456 real = get_real_bo(bo);
457 } else {
458 real = get_slab_entry_real_bo(bo);
459 offset = get_slab_entry_offset(bo);
460 }
461
462 if (usage & RADEON_MAP_TEMPORARY) {
463 if (real->is_user_ptr) {
464 cpu = real->cpu_ptr;
465 } else {
466 if (!amdgpu_bo_do_map(rws, real, &cpu))
467 return NULL;
468 }
469 } else {
470 cpu = p_atomic_read(&real->cpu_ptr);
471 if (!cpu) {
472 simple_mtx_lock(&real->map_lock);
473 /* Must re-check due to the possibility of a race. The re-check need not
474 * be atomic thanks to the lock. */
475 cpu = real->cpu_ptr;
476 if (!cpu) {
477 if (!amdgpu_bo_do_map(rws, real, &cpu)) {
478 simple_mtx_unlock(&real->map_lock);
479 return NULL;
480 }
481 p_atomic_set(&real->cpu_ptr, cpu);
482 }
483 simple_mtx_unlock(&real->map_lock);
484 }
485 }
486
487 return (uint8_t*)cpu + offset;
488 }
489
490 void amdgpu_bo_unmap(struct radeon_winsys *rws, struct pb_buffer_lean *buf)
491 {
492 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
493 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
494 struct amdgpu_bo_real *real;
495
496 assert(bo->type != AMDGPU_BO_SPARSE);
497
498 real = is_real_bo(bo) ? get_real_bo(bo) : get_slab_entry_real_bo(bo);
499
500 if (real->is_user_ptr)
501 return;
502
503 assert(real->map_count != 0 && "too many unmaps");
504 if (p_atomic_dec_zero(&real->map_count)) {
505 assert(!real->cpu_ptr &&
506 "too many unmaps or forgot RADEON_MAP_TEMPORARY flag");
507
508 if (real->b.base.placement & RADEON_DOMAIN_VRAM)
509 aws->mapped_vram -= real->b.base.size;
510 else if (real->b.base.placement & RADEON_DOMAIN_GTT)
511 aws->mapped_gtt -= real->b.base.size;
512 aws->num_mapped_buffers--;
513 }
514
515 assert(aws->dev);
516 ac_drm_bo_cpu_unmap(aws->dev, real->bo);
517 }
518
519 static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys *aws, struct amdgpu_bo_real *bo)
520 {
521 #if MESA_DEBUG
522 if (aws->debug_all_bos) {
523 simple_mtx_lock(&aws->global_bo_list_lock);
524 list_addtail(&bo->global_list_item, &aws->global_bo_list);
525 aws->num_buffers++;
526 simple_mtx_unlock(&aws->global_bo_list_lock);
527 }
528 #endif
529 }
530
531 static unsigned amdgpu_get_optimal_alignment(struct amdgpu_winsys *aws,
532 uint64_t size, unsigned alignment)
533 {
534 /* Increase the alignment for faster address translation and better memory
535 * access pattern.
536 */
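/* e.g. below the PTE fragment size, a 100 KB request is aligned to 64 KB: the largest power of
 * two not exceeding its size.
 */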
537 if (size >= aws->info.pte_fragment_size) {
538 alignment = MAX2(alignment, aws->info.pte_fragment_size);
539 } else if (size) {
540 unsigned msb = util_last_bit(size);
541
542 alignment = MAX2(alignment, 1u << (msb - 1));
543 }
544 return alignment;
545 }
546
547 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *aws,
548 uint64_t size,
549 unsigned alignment,
550 enum radeon_bo_domain initial_domain,
551 unsigned flags,
552 int heap)
553 {
554 struct amdgpu_bo_alloc_request request = {0};
555 ac_drm_bo buf_handle;
556 uint64_t va = 0;
557 struct amdgpu_bo_real *bo;
558 amdgpu_va_handle va_handle = NULL;
559 int r;
560
561 /* Exactly one of VRAM, GTT, GDS, OA, or DOORBELL must be specified. */
562 assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT |
563 RADEON_DOMAIN_GDS |
564 RADEON_DOMAIN_OA |
565 RADEON_DOMAIN_DOORBELL)) == 1);
566
567 alignment = amdgpu_get_optimal_alignment(aws, size, alignment);
568
569 if (heap >= 0 && flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) {
570 struct amdgpu_bo_real_reusable *new_bo;
571 bool slab_backing = flags & RADEON_FLAG_WINSYS_SLAB_BACKING;
572
573 if (slab_backing)
574 new_bo = (struct amdgpu_bo_real_reusable *)CALLOC_STRUCT(amdgpu_bo_real_reusable_slab);
575 else
576 new_bo = CALLOC_STRUCT(amdgpu_bo_real_reusable);
577
578 if (!new_bo)
579 return NULL;
580
581 bo = &new_bo->b;
582 pb_cache_init_entry(&aws->bo_cache, &new_bo->cache_entry, &bo->b.base, heap);
583 bo->b.type = slab_backing ? AMDGPU_BO_REAL_REUSABLE_SLAB : AMDGPU_BO_REAL_REUSABLE;
584 } else {
585 bo = CALLOC_STRUCT(amdgpu_bo_real);
586 if (!bo)
587 return NULL;
588
589 bo->b.type = AMDGPU_BO_REAL;
590 }
591
592 request.alloc_size = size;
593 request.phys_alignment = alignment;
594
595 if (initial_domain & RADEON_DOMAIN_VRAM) {
596 request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
597
598 /* Since VRAM and GTT have almost the same performance on APUs, we could
599 * just set GTT. However, in order to decrease GTT(RAM) usage, which is
600 * shared with the OS, allow VRAM placements too. The point is not that VRAM
601 * is more useful here, but that it would otherwise sit unused and wasted.
602 */
603 if (!aws->info.has_dedicated_vram)
604 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
605 }
606
607 if (initial_domain & RADEON_DOMAIN_GTT)
608 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
609 if (initial_domain & RADEON_DOMAIN_GDS)
610 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
611 if (initial_domain & RADEON_DOMAIN_OA)
612 request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;
613 if (initial_domain & RADEON_DOMAIN_DOORBELL)
614 request.preferred_heap |= AMDGPU_GEM_DOMAIN_DOORBELL;
615
616 if (flags & RADEON_FLAG_NO_CPU_ACCESS)
617 request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
618 if (flags & RADEON_FLAG_GTT_WC)
619 request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
620
621 if (aws->info.has_local_buffers &&
622 initial_domain & (RADEON_DOMAIN_VRAM_GTT | RADEON_DOMAIN_DOORBELL) &&
623 flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
624 request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
625
626 if (flags & RADEON_FLAG_DISCARDABLE &&
627 aws->info.drm_minor >= 47)
628 request.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
629
630 if ((flags & RADEON_FLAG_CLEAR_VRAM) || (aws->zero_all_vram_allocs &&
631 (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)))
632 request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
633
634 if ((flags & RADEON_FLAG_ENCRYPTED) &&
635 aws->info.has_tmz_support) {
636 request.flags |= AMDGPU_GEM_CREATE_ENCRYPTED;
637
638 if (!(flags & RADEON_FLAG_DRIVER_INTERNAL)) {
639 struct amdgpu_screen_winsys *sws_iter;
640 simple_mtx_lock(&aws->sws_list_lock);
641 for (sws_iter = aws->sws_list; sws_iter; sws_iter = sws_iter->next) {
642 *((bool*) &sws_iter->base.uses_secure_bos) = true;
643 }
644 simple_mtx_unlock(&aws->sws_list_lock);
645 }
646 }
647
648 if (flags & RADEON_FLAG_GFX12_ALLOW_DCC)
649 request.flags |= AMDGPU_GEM_CREATE_GFX12_DCC;
650
651 /* Set AMDGPU_GEM_CREATE_VIRTIO_SHARED if the driver didn't disable buffer sharing. */
652 if (aws->info.is_virtio && (initial_domain & RADEON_DOMAIN_VRAM_GTT) &&
653 (flags & (RADEON_FLAG_DRIVER_INTERNAL | RADEON_FLAG_NO_INTERPROCESS_SHARING)) == 0)
654 request.flags |= AMDGPU_GEM_CREATE_VIRTIO_SHARED;
655
656 r = ac_drm_bo_alloc(aws->dev, &request, &buf_handle);
657 if (r) {
658 fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
659 fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);
660 fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
661 fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);
662 fprintf(stderr, "amdgpu: flags : %" PRIx64 "\n", request.flags);
663 goto error_bo_alloc;
664 }
665
666 uint32_t kms_handle = 0;
667 ac_drm_bo_export(aws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
668
669 if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {
670 unsigned va_gap_size = aws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
671
672 r = ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
673 size + va_gap_size, alignment,
674 0, &va, &va_handle,
675 (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
676 AMDGPU_VA_RANGE_HIGH);
677 if (r)
678 goto error_va_alloc;
679
680 unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
681 AMDGPU_VM_PAGE_WRITEABLE |
682 AMDGPU_VM_PAGE_EXECUTABLE;
683
684 if (flags & RADEON_FLAG_GL2_BYPASS)
685 vm_flags |= AMDGPU_VM_MTYPE_UC;
686
687 r = amdgpu_bo_va_op_common(aws, NULL, kms_handle, false, &bo->vm_timeline_point, 0,
688 size, va, vm_flags, AMDGPU_VA_OP_MAP);
689 if (r)
690 goto error_va_map;
691 }
692
693 simple_mtx_init(&bo->map_lock, mtx_plain);
694 pipe_reference_init(&bo->b.base.reference, 1);
695 bo->b.base.placement = initial_domain;
696 bo->b.base.alignment_log2 = util_logbase2(alignment);
697 bo->b.base.usage = flags;
698 bo->b.base.size = size;
699 bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
700 bo->bo = buf_handle;
701 bo->va_handle = va_handle;
702 bo->kms_handle = kms_handle;
703
704 if (initial_domain & RADEON_DOMAIN_VRAM)
705 aws->allocated_vram += align64(size, aws->info.gart_page_size);
706 else if (initial_domain & RADEON_DOMAIN_GTT)
707 aws->allocated_gtt += align64(size, aws->info.gart_page_size);
708
709 amdgpu_add_buffer_to_global_list(aws, bo);
710
711 return &bo->b;
712
713 error_va_map:
714 ac_drm_va_range_free(va_handle);
715
716 error_va_alloc:
717 ac_drm_bo_free(aws->dev, buf_handle);
718
719 error_bo_alloc:
720 FREE(bo);
721 return NULL;
722 }
723
724 bool amdgpu_bo_can_reclaim(struct amdgpu_winsys *aws, struct pb_buffer_lean *_buf)
725 {
726 return amdgpu_bo_wait(&aws->dummy_sws.base, _buf, 0, RADEON_USAGE_READWRITE);
727 }
728
729 bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
730 {
731 struct amdgpu_bo_slab_entry *bo = container_of(entry, struct amdgpu_bo_slab_entry, entry);
732
733 return amdgpu_bo_can_reclaim(priv, &bo->b.base);
734 }
735
736 static unsigned get_slab_wasted_size(struct amdgpu_winsys *aws, struct amdgpu_bo_slab_entry *bo)
737 {
738 assert(bo->b.base.size <= bo->entry.slab->entry_size);
739 assert(bo->b.base.size < (1 << bo->b.base.alignment_log2) ||
740 bo->b.base.size < 1 << aws->bo_slabs.min_order ||
741 bo->b.base.size > bo->entry.slab->entry_size / 2);
742 return bo->entry.slab->entry_size - bo->b.base.size;
743 }
744
745 static void amdgpu_bo_slab_destroy(struct radeon_winsys *rws, struct pb_buffer_lean *_buf)
746 {
747 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
748 struct amdgpu_bo_slab_entry *bo = get_slab_entry_bo(amdgpu_winsys_bo(_buf));
749
750 if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
751 aws->slab_wasted_vram -= get_slab_wasted_size(aws, bo);
752 else
753 aws->slab_wasted_gtt -= get_slab_wasted_size(aws, bo);
754
755 pb_slab_free(&aws->bo_slabs, &bo->entry);
756 }
757
758 /* Return the power-of-two slab entry size matching the input size. */
759 static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *aws, unsigned size)
760 {
761 unsigned entry_size = util_next_power_of_two(size);
762 unsigned min_entry_size = 1 << aws->bo_slabs.min_order;
763
764 return MAX2(entry_size, min_entry_size);
765 }
766
767 /* Return the slab entry alignment. */
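/* e.g. a 48 KB entry (3/4 of 64 KB) only needs 16 KB alignment, while a full 64 KB entry keeps
 * 64 KB alignment.
 */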
768 static unsigned get_slab_entry_alignment(struct amdgpu_winsys *aws, unsigned size)
769 {
770 unsigned entry_size = get_slab_pot_entry_size(aws, size);
771
772 if (size <= entry_size * 3 / 4)
773 return entry_size / 4;
774
775 return entry_size;
776 }
777
778 struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_size,
779 unsigned group_index)
780 {
781 struct amdgpu_winsys *aws = priv;
782 enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
783 enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
784
785 /* Determine the slab buffer size. */
786 unsigned max_entry_size = 1 << (aws->bo_slabs.min_order + aws->bo_slabs.num_orders - 1);
787
788 assert(entry_size <= max_entry_size);
789
790 /* The slab size is twice the size of the largest possible entry. */
791 unsigned slab_size = max_entry_size * 2;
792
793 if (!util_is_power_of_two_nonzero(entry_size)) {
794 assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
795
796 /* If the entry size is 3/4 of a power of two, we would waste space and not gain
797 * anything if we allocated only twice the power of two for the backing buffer:
798 * 2 * 3/4 = 1.5 usable with buffer size 2
799 *
800 * Allocating 5 times the entry size leads us to the next power of two and results
801 * in a much better memory utilization:
802 * 5 * 3/4 = 3.75 usable with buffer size 4
803 */
804 if (entry_size * 5 > slab_size)
805 slab_size = util_next_power_of_two(entry_size * 5);
806 }
807
808 /* The largest slab should have the same size as the PTE fragment
809 * size to get faster address translation.
810 */
811 slab_size = MAX2(slab_size, aws->info.pte_fragment_size);
812
813 flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING |
814 RADEON_FLAG_NO_SUBALLOC |
815 RADEON_FLAG_WINSYS_SLAB_BACKING;
816
817 struct amdgpu_bo_real_reusable_slab *slab_bo =
818 (struct amdgpu_bo_real_reusable_slab*)amdgpu_bo_create(aws, slab_size, slab_size,
819 domains, flags);
820 if (!slab_bo)
821 return NULL;
822
823 /* The slab is not suballocated. */
824 assert(is_real_bo(&slab_bo->b.b.b));
825 assert(slab_bo->b.b.b.type == AMDGPU_BO_REAL_REUSABLE_SLAB);
826
827 /* We can get a buffer from pb_cache that is slightly larger. */
828 slab_size = slab_bo->b.b.b.base.size;
829
830 slab_bo->slab.num_entries = slab_size / entry_size;
831 slab_bo->slab.num_free = slab_bo->slab.num_entries;
832 slab_bo->slab.group_index = group_index;
833 slab_bo->slab.entry_size = entry_size;
834 slab_bo->entries = os_malloc_aligned(slab_bo->slab.num_entries * sizeof(*slab_bo->entries),
835 CACHE_LINE_SIZE);
836 if (!slab_bo->entries)
837 goto fail;
838
839 memset(slab_bo->entries, 0, slab_bo->slab.num_entries * sizeof(*slab_bo->entries));
840 list_inithead(&slab_bo->slab.free);
841
842 for (unsigned i = 0; i < slab_bo->slab.num_entries; ++i) {
843 struct amdgpu_bo_slab_entry *bo = &slab_bo->entries[i];
844
845 bo->b.base.placement = domains;
846 bo->b.base.alignment_log2 = util_logbase2(get_slab_entry_alignment(aws, entry_size));
847 bo->b.base.size = entry_size;
848 bo->b.type = AMDGPU_BO_SLAB_ENTRY;
849
850 bo->entry.slab = &slab_bo->slab;
851 list_addtail(&bo->entry.head, &slab_bo->slab.free);
852 }
853
854 /* Wasted alignment due to slabs with 3/4 allocations being aligned to a power of two. */
855 assert(slab_bo->slab.num_entries * entry_size <= slab_size);
856 if (domains & RADEON_DOMAIN_VRAM)
857 aws->slab_wasted_vram += slab_size - slab_bo->slab.num_entries * entry_size;
858 else
859 aws->slab_wasted_gtt += slab_size - slab_bo->slab.num_entries * entry_size;
860
861 return &slab_bo->slab;
862
863 fail:
864 amdgpu_winsys_bo_reference(aws, (struct amdgpu_winsys_bo**)&slab_bo, NULL);
865 return NULL;
866 }
867
868 void amdgpu_bo_slab_free(struct amdgpu_winsys *aws, struct pb_slab *slab)
869 {
870 struct amdgpu_bo_real_reusable_slab *bo = get_bo_from_slab(slab);
871 unsigned slab_size = bo->b.b.b.base.size;
872
873 assert(bo->slab.num_entries * bo->slab.entry_size <= slab_size);
874 if (bo->b.b.b.base.placement & RADEON_DOMAIN_VRAM)
875 aws->slab_wasted_vram -= slab_size - bo->slab.num_entries * bo->slab.entry_size;
876 else
877 aws->slab_wasted_gtt -= slab_size - bo->slab.num_entries * bo->slab.entry_size;
878
879 for (unsigned i = 0; i < bo->slab.num_entries; ++i)
880 amdgpu_bo_remove_fences(&bo->entries[i].b);
881
882 os_free_aligned(bo->entries);
883 amdgpu_winsys_bo_reference(aws, (struct amdgpu_winsys_bo**)&bo, NULL);
884 }
885
886 #if DEBUG_SPARSE_COMMITS
887 static void
888 sparse_dump(struct amdgpu_bo_sparse *bo, const char *func)
889 {
890 fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
891 "Commitments:\n",
892 __func__, bo, bo->b.base.size, bo->num_va_pages, func);
893
894 struct amdgpu_sparse_backing *span_backing = NULL;
895 uint32_t span_first_backing_page = 0;
896 uint32_t span_first_va_page = 0;
897 uint32_t va_page = 0;
898
899 for (;;) {
900 struct amdgpu_sparse_backing *backing = 0;
901 uint32_t backing_page = 0;
902
903 if (va_page < bo->num_va_pages) {
904 backing = bo->commitments[va_page].backing;
905 backing_page = bo->commitments[va_page].page;
906 }
907
908 if (span_backing &&
909 (backing != span_backing ||
910 backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
911 fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
912 span_first_va_page, va_page - 1, span_backing,
913 span_first_backing_page,
914 span_first_backing_page + (va_page - span_first_va_page) - 1);
915
916 span_backing = NULL;
917 }
918
919 if (va_page >= bo->num_va_pages)
920 break;
921
922 if (backing && !span_backing) {
923 span_backing = backing;
924 span_first_backing_page = backing_page;
925 span_first_va_page = va_page;
926 }
927
928 va_page++;
929 }
930
931 fprintf(stderr, "Backing:\n");
932
933 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->backing, list) {
934 fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->b.base.size);
935 for (unsigned i = 0; i < backing->num_chunks; ++i)
936 fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
937 }
938 }
939 #endif
940
941 /*
942 * Attempt to allocate the given number of backing pages. Fewer pages may be
943 * allocated (depending on the fragmentation of existing backing buffers),
944 * which will be reflected by a change to *pnum_pages.
945 */
946 static struct amdgpu_sparse_backing *
947 sparse_backing_alloc(struct amdgpu_winsys *aws, struct amdgpu_bo_sparse *bo,
948 uint32_t *pstart_page, uint32_t *pnum_pages)
949 {
950 struct amdgpu_sparse_backing *best_backing;
951 unsigned best_idx;
952 uint32_t best_num_pages;
953
954 best_backing = NULL;
955 best_idx = 0;
956 best_num_pages = 0;
957
958 /* This is a very simple and inefficient best-fit algorithm. */
959 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->backing, list) {
960 for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
961 uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
962 if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
963 (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
964 best_backing = backing;
965 best_idx = idx;
966 best_num_pages = cur_num_pages;
967 }
968 }
969 }
970
971 /* Allocate a new backing buffer if necessary. */
972 if (!best_backing) {
973 struct pb_buffer_lean *buf;
974 uint64_t size;
975 uint32_t pages;
976
977 best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
978 if (!best_backing)
979 return NULL;
980
981 best_backing->max_chunks = 4;
982 best_backing->chunks = CALLOC(best_backing->max_chunks,
983 sizeof(*best_backing->chunks));
984 if (!best_backing->chunks) {
985 FREE(best_backing);
986 return NULL;
987 }
988
989 assert(bo->num_backing_pages < DIV_ROUND_UP(bo->b.base.size, RADEON_SPARSE_PAGE_SIZE));
990
991 size = MIN3(bo->b.base.size / 16,
992 8 * 1024 * 1024,
993 bo->b.base.size - (uint64_t)bo->num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
994 size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
995
996 buf = amdgpu_bo_create(aws, size, RADEON_SPARSE_PAGE_SIZE,
997 bo->b.base.placement,
998 (bo->b.base.usage & ~RADEON_FLAG_SPARSE &
999 /* Set the interprocess sharing flag to disable pb_cache because
1000 * amdgpu_bo_wait doesn't wait for active CS jobs.
1001 */
1002 ~RADEON_FLAG_NO_INTERPROCESS_SHARING) | RADEON_FLAG_NO_SUBALLOC);
1003 if (!buf) {
1004 FREE(best_backing->chunks);
1005 FREE(best_backing);
1006 return NULL;
1007 }
1008
1009 /* We might have gotten a bigger buffer than requested via caching. */
1010 pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
1011
1012 best_backing->bo = get_real_bo(amdgpu_winsys_bo(buf));
1013 best_backing->num_chunks = 1;
1014 best_backing->chunks[0].begin = 0;
1015 best_backing->chunks[0].end = pages;
1016
1017 list_add(&best_backing->list, &bo->backing);
1018 bo->num_backing_pages += pages;
1019
1020 best_idx = 0;
1021 best_num_pages = pages;
1022 }
1023
1024 *pnum_pages = MIN2(*pnum_pages, best_num_pages);
1025 *pstart_page = best_backing->chunks[best_idx].begin;
1026 best_backing->chunks[best_idx].begin += *pnum_pages;
1027
1028 if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
1029 memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
1030 sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
1031 best_backing->num_chunks--;
1032 }
1033
1034 return best_backing;
1035 }
1036
1037 static void
1038 sparse_free_backing_buffer(struct amdgpu_winsys *aws, struct amdgpu_bo_sparse *bo,
1039 struct amdgpu_sparse_backing *backing)
1040 {
1041 bo->num_backing_pages -= backing->bo->b.base.size / RADEON_SPARSE_PAGE_SIZE;
1042
1043 /* Add fences from bo to backing->bo. */
1044 simple_mtx_lock(&aws->bo_fence_lock);
1045 u_foreach_bit(i, bo->b.fences.valid_fence_mask) {
1046 add_seq_no_to_list(aws, &backing->bo->b.fences, i, bo->b.fences.seq_no[i]);
1047 }
1048 simple_mtx_unlock(&aws->bo_fence_lock);
1049
1050 list_del(&backing->list);
1051 amdgpu_winsys_bo_reference(aws, (struct amdgpu_winsys_bo**)&backing->bo, NULL);
1052 FREE(backing->chunks);
1053 FREE(backing);
1054 }
1055
1056 /*
1057 * Return a range of pages from the given backing buffer back into the
1058 * free structure.
1059 */
1060 static bool
1061 sparse_backing_free(struct amdgpu_winsys *aws, struct amdgpu_bo_sparse *bo,
1062 struct amdgpu_sparse_backing *backing,
1063 uint32_t start_page, uint32_t num_pages)
1064 {
1065 uint32_t end_page = start_page + num_pages;
1066 unsigned low = 0;
1067 unsigned high = backing->num_chunks;
1068
1069 /* Find the first chunk with begin >= start_page. */
1070 while (low < high) {
1071 unsigned mid = low + (high - low) / 2;
1072
1073 if (backing->chunks[mid].begin >= start_page)
1074 high = mid;
1075 else
1076 low = mid + 1;
1077 }
1078
1079 assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
1080 assert(low == 0 || backing->chunks[low - 1].end <= start_page);
1081
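/* Coalesce the freed range with the preceding and/or following free chunk, or insert a new chunk,
 * keeping the array sorted. */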
1082 if (low > 0 && backing->chunks[low - 1].end == start_page) {
1083 backing->chunks[low - 1].end = end_page;
1084
1085 if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
1086 backing->chunks[low - 1].end = backing->chunks[low].end;
1087 memmove(&backing->chunks[low], &backing->chunks[low + 1],
1088 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
1089 backing->num_chunks--;
1090 }
1091 } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
1092 backing->chunks[low].begin = start_page;
1093 } else {
1094 if (backing->num_chunks >= backing->max_chunks) {
1095 unsigned new_max_chunks = 2 * backing->max_chunks;
1096 struct amdgpu_sparse_backing_chunk *new_chunks =
1097 REALLOC(backing->chunks,
1098 sizeof(*backing->chunks) * backing->max_chunks,
1099 sizeof(*backing->chunks) * new_max_chunks);
1100 if (!new_chunks)
1101 return false;
1102
1103 backing->max_chunks = new_max_chunks;
1104 backing->chunks = new_chunks;
1105 }
1106
1107 memmove(&backing->chunks[low + 1], &backing->chunks[low],
1108 sizeof(*backing->chunks) * (backing->num_chunks - low));
1109 backing->chunks[low].begin = start_page;
1110 backing->chunks[low].end = end_page;
1111 backing->num_chunks++;
1112 }
1113
1114 if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
1115 backing->chunks[0].end == backing->bo->b.base.size / RADEON_SPARSE_PAGE_SIZE)
1116 sparse_free_backing_buffer(aws, bo, backing);
1117
1118 return true;
1119 }
1120
1121 static void amdgpu_bo_sparse_destroy(struct radeon_winsys *rws, struct pb_buffer_lean *_buf)
1122 {
1123 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1124 struct amdgpu_bo_sparse *bo = get_sparse_bo(amdgpu_winsys_bo(_buf));
1125 int r;
1126
1127 r = amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(_buf), 0, true, NULL, 0,
1128 (uint64_t)bo->num_va_pages * RADEON_SPARSE_PAGE_SIZE,
1129 amdgpu_va_get_start_addr(bo->va_handle), 0, AMDGPU_VA_OP_CLEAR);
1130 if (r) {
1131 fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
1132 }
1133
1134 while (!list_is_empty(&bo->backing)) {
1135 sparse_free_backing_buffer(aws, bo,
1136 container_of(bo->backing.next,
1137 struct amdgpu_sparse_backing, list));
1138 }
1139
1140 ac_drm_va_range_free(bo->va_handle);
1141 FREE(bo->commitments);
1142 simple_mtx_destroy(&bo->commit_lock);
1143 FREE(bo);
1144 }
1145
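/* Create a sparse (PRT) buffer: reserve a VA range for the whole size and map it as PRT with no
 * physical backing; pages are committed later through amdgpu_bo_sparse_commit.
 */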
1146 static struct pb_buffer_lean *
1147 amdgpu_bo_sparse_create(struct amdgpu_winsys *aws, uint64_t size,
1148 enum radeon_bo_domain domain,
1149 enum radeon_bo_flag flags)
1150 {
1151 struct amdgpu_bo_sparse *bo;
1152 uint64_t map_size;
1153 uint64_t va_gap_size;
1154 int r;
1155
1156 /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
1157 * that exceed this limit. This is not really a restriction: we don't have
1158 * that much virtual address space anyway.
1159 */
1160 if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
1161 return NULL;
1162
1163 bo = CALLOC_STRUCT(amdgpu_bo_sparse);
1164 if (!bo)
1165 return NULL;
1166
1167 simple_mtx_init(&bo->commit_lock, mtx_plain);
1168 pipe_reference_init(&bo->b.base.reference, 1);
1169 bo->b.base.placement = domain;
1170 bo->b.base.alignment_log2 = util_logbase2(RADEON_SPARSE_PAGE_SIZE);
1171 bo->b.base.usage = flags;
1172 bo->b.base.size = size;
1173 bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
1174 bo->b.type = AMDGPU_BO_SPARSE;
1175
1176 bo->num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1177 bo->commitments = CALLOC(bo->num_va_pages, sizeof(*bo->commitments));
1178 if (!bo->commitments)
1179 goto error_alloc_commitments;
1180
1181 list_inithead(&bo->backing);
1182
1183 /* For simplicity, we always map a multiple of the page size. */
1184 map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
1185 va_gap_size = aws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
1186
1187 uint64_t gpu_address;
1188 r = ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
1189 map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
1190 0, &gpu_address, &bo->va_handle, AMDGPU_VA_RANGE_HIGH);
1191 if (r)
1192 goto error_va_alloc;
1193
1194 r = amdgpu_bo_va_op_common(aws, NULL, 0, false, &bo->vm_timeline_point, 0, map_size,
1195 gpu_address, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
1196 if (r)
1197 goto error_va_map;
1198
1199 return &bo->b.base;
1200
1201 error_va_map:
1202 ac_drm_va_range_free(bo->va_handle);
1203 error_va_alloc:
1204 FREE(bo->commitments);
1205 error_alloc_commitments:
1206 simple_mtx_destroy(&bo->commit_lock);
1207 FREE(bo);
1208 return NULL;
1209 }
1210
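/* Commit or decommit a page-aligned range of a sparse buffer. Committing maps chunks of backing
 * memory into the range; decommitting remaps the range as PRT and returns the pages to their
 * backing buffers.
 */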
1211 static bool
1212 amdgpu_bo_sparse_commit(struct radeon_winsys *rws, struct pb_buffer_lean *buf,
1213 uint64_t offset, uint64_t size, bool commit)
1214 {
1215 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1216 struct amdgpu_bo_sparse *bo = get_sparse_bo(amdgpu_winsys_bo(buf));
1217 struct amdgpu_sparse_commitment *comm;
1218 uint32_t va_page, end_va_page;
1219 bool ok = true;
1220 int r;
1221
1222 assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
1223 assert(offset <= bo->b.base.size);
1224 assert(size <= bo->b.base.size - offset);
1225 assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->b.base.size);
1226
1227 comm = bo->commitments;
1228 va_page = offset / RADEON_SPARSE_PAGE_SIZE;
1229 end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1230
1231 simple_mtx_lock(&bo->commit_lock);
1232
1233 #if DEBUG_SPARSE_COMMITS
1234 sparse_dump(bo, __func__);
1235 #endif
1236
1237 if (commit) {
1238 while (va_page < end_va_page) {
1239 uint32_t span_va_page;
1240
1241 /* Skip pages that are already committed. */
1242 if (comm[va_page].backing) {
1243 va_page++;
1244 continue;
1245 }
1246
1247 /* Determine length of uncommitted span. */
1248 span_va_page = va_page;
1249 while (va_page < end_va_page && !comm[va_page].backing)
1250 va_page++;
1251
1252 /* Fill the uncommitted span with chunks of backing memory. */
1253 while (span_va_page < va_page) {
1254 struct amdgpu_sparse_backing *backing;
1255 uint32_t backing_start, backing_size;
1256
1257 backing_size = va_page - span_va_page;
1258 backing = sparse_backing_alloc(aws, bo, &backing_start, &backing_size);
1259 if (!backing) {
1260 ok = false;
1261 goto out;
1262 }
1263
1264 r = amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(buf), backing->bo->kms_handle,
1265 true, &bo->vm_timeline_point,
1266 (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
1267 (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
1268 amdgpu_va_get_start_addr(bo->va_handle) +
1269 (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
1270 AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
1271 AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_REPLACE);
1272 if (r) {
1273 ok = sparse_backing_free(aws, bo, backing, backing_start, backing_size);
1274 assert(ok && "sufficient memory should already be allocated");
1275
1276 ok = false;
1277 goto out;
1278 }
1279
1280 while (backing_size) {
1281 comm[span_va_page].backing = backing;
1282 comm[span_va_page].page = backing_start;
1283 span_va_page++;
1284 backing_start++;
1285 backing_size--;
1286 }
1287 }
1288 }
1289 } else {
1290 r = amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(buf), 0, true, &bo->vm_timeline_point,
1291 0, (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
1292 amdgpu_va_get_start_addr(bo->va_handle) +
1293 (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
1294 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
1295 if (r) {
1296 ok = false;
1297 goto out;
1298 }
1299
1300 while (va_page < end_va_page) {
1301 struct amdgpu_sparse_backing *backing;
1302 uint32_t backing_start;
1303 uint32_t span_pages;
1304
1305 /* Skip pages that are already uncommitted. */
1306 if (!comm[va_page].backing) {
1307 va_page++;
1308 continue;
1309 }
1310
1311 /* Group contiguous spans of pages. */
1312 backing = comm[va_page].backing;
1313 backing_start = comm[va_page].page;
1314 comm[va_page].backing = NULL;
1315
1316 span_pages = 1;
1317 va_page++;
1318
1319 while (va_page < end_va_page &&
1320 comm[va_page].backing == backing &&
1321 comm[va_page].page == backing_start + span_pages) {
1322 comm[va_page].backing = NULL;
1323 va_page++;
1324 span_pages++;
1325 }
1326
1327 if (!sparse_backing_free(aws, bo, backing, backing_start, span_pages)) {
1328 /* Couldn't allocate tracking data structures, so we have to leak */
1329 fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
1330 ok = false;
1331 }
1332 }
1333 }
1334 out:
1335
1336 simple_mtx_unlock(&bo->commit_lock);
1337
1338 return ok;
1339 }
1340
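/* Find the first committed subrange of [range_offset, range_offset + *range_size): on return,
 * *range_size holds the size of that committed part (0 if there is none) and the return value is
 * the number of bytes to skip before it.
 */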
1341 static unsigned
1342 amdgpu_bo_find_next_committed_memory(struct pb_buffer_lean *buf,
1343 uint64_t range_offset, unsigned *range_size)
1344 {
1345 struct amdgpu_bo_sparse *bo = get_sparse_bo(amdgpu_winsys_bo(buf));
1346 struct amdgpu_sparse_commitment *comm;
1347 uint32_t va_page, end_va_page;
1348 uint32_t span_va_page, start_va_page;
1349 unsigned uncommitted_range_prev, uncommitted_range_next;
1350
1351 if (*range_size == 0)
1352 return 0;
1353
1354 assert(*range_size + range_offset <= bo->b.base.size);
1355
1356 uncommitted_range_prev = uncommitted_range_next = 0;
1357 comm = bo->commitments;
1358 start_va_page = va_page = range_offset / RADEON_SPARSE_PAGE_SIZE;
1359 end_va_page = (*range_size + range_offset) / RADEON_SPARSE_PAGE_SIZE;
1360
1361 simple_mtx_lock(&bo->commit_lock);
1362 /* Look up the first committed page with backing physical storage. */
1363 while (va_page < end_va_page && !comm[va_page].backing)
1364 va_page++;
1365
1366 /* First committed page lookup failed; return early. */
1367 if (va_page == end_va_page && !comm[va_page].backing) {
1368 uncommitted_range_prev = *range_size;
1369 *range_size = 0;
1370 simple_mtx_unlock(&bo->commit_lock);
1371 return uncommitted_range_prev;
1372 }
1373
1374 /* Look up the first uncommitted page without backing physical storage. */
1375 span_va_page = va_page;
1376 while (va_page < end_va_page && comm[va_page].backing)
1377 va_page++;
1378 simple_mtx_unlock(&bo->commit_lock);
1379
1380 /* Calculate the byte count to skip before the committed range. */
1381 if (span_va_page != start_va_page)
1382 uncommitted_range_prev = span_va_page * RADEON_SPARSE_PAGE_SIZE - range_offset;
1383
1384 /* Calculate the byte count to skip after the committed range. */
1385 if (va_page != end_va_page || !comm[va_page].backing) {
1386 uncommitted_range_next = *range_size + range_offset - va_page * RADEON_SPARSE_PAGE_SIZE;
1387 }
1388
1389 /* Calculate the size of the first committed part. */
1390 *range_size = *range_size - uncommitted_range_next - uncommitted_range_prev;
1391 return *range_size ? uncommitted_range_prev : uncommitted_range_prev + uncommitted_range_next;
1392 }
1393
1394 static void amdgpu_buffer_get_metadata(struct radeon_winsys *rws,
1395 struct pb_buffer_lean *_buf,
1396 struct radeon_bo_metadata *md,
1397 struct radeon_surf *surf)
1398 {
1399 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1400 struct amdgpu_bo_real *bo = get_real_bo(amdgpu_winsys_bo(_buf));
1401 struct amdgpu_bo_info info = {0};
1402 uint32_t md_version, md_flags;
1403 enum amd_gfx_level gfx_level = aws->info.gfx_level;
1404 int r;
1405
1406 r = ac_drm_bo_query_info(aws->dev, bo->kms_handle, &info);
1407 if (r)
1408 return;
1409
1410 md->size_metadata = info.metadata.size_metadata;
1411 memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
1412
1413 md_version = md->metadata[0] & 0xffff;
1414 if (md_version >= 3 && md->size_metadata > 4) {
1415 md_flags = md->metadata[0] >> 16;
1416 if (md_flags & (1u << AC_SURF_METADATA_FLAG_FAMILY_OVERRIDEN_BIT)) {
1417 /* The overridden gfx_level is always the last dword. */
1418 gfx_level = md->metadata[md->size_metadata / 4 - 1];
1419
1420 /* Fallback to the default value if the value we got is incorrect. */
1421 if (gfx_level < GFX6 || gfx_level >= NUM_GFX_VERSIONS)
1422 gfx_level = aws->info.gfx_level;
1423 }
1424 }
1425
1426 ac_surface_apply_bo_metadata(gfx_level, surf, info.metadata.tiling_info,
1427 &md->mode);
1428 }
1429
1430 static void amdgpu_buffer_set_metadata(struct radeon_winsys *rws,
1431 struct pb_buffer_lean *_buf,
1432 struct radeon_bo_metadata *md,
1433 struct radeon_surf *surf)
1434 {
1435 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1436 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1437 struct amdgpu_bo_real *real = is_real_bo(bo) ? get_real_bo(bo) : get_slab_entry_real_bo(bo);
1438 struct amdgpu_bo_metadata metadata = {0};
1439
1440 ac_surface_compute_bo_metadata(&aws->info, surf, &metadata.tiling_info);
1441
1442 metadata.size_metadata = md->size_metadata;
1443 memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
1444
1445 ac_drm_bo_set_metadata(aws->dev, real->kms_handle, &metadata);
1446 }
1447
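/* Central buffer allocation path: sparse buffers take their own path, small allocations are
 * sub-allocated from slabs, and everything else comes from the reusable cache or a fresh kernel
 * allocation.
 */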
1448 struct pb_buffer_lean *
1449 amdgpu_bo_create(struct amdgpu_winsys *aws,
1450 uint64_t size,
1451 unsigned alignment,
1452 enum radeon_bo_domain domain,
1453 enum radeon_bo_flag flags)
1454 {
1455 struct amdgpu_winsys_bo *bo;
1456
1457 radeon_canonicalize_bo_flags(&domain, &flags);
1458
1459 /* Handle sparse buffers first. */
1460 if (flags & RADEON_FLAG_SPARSE) {
1461 assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
1462
1463 return amdgpu_bo_sparse_create(aws, size, domain, flags);
1464 }
1465
1466 unsigned max_slab_entry_size = 1 << (aws->bo_slabs.min_order + aws->bo_slabs.num_orders - 1);
1467 int heap = radeon_get_heap_index(domain, flags);
1468
1469 /* Sub-allocate small buffers from slabs. */
1470 if (heap >= 0 && size <= max_slab_entry_size) {
1471 struct pb_slab_entry *entry;
1472 unsigned alloc_size = size;
1473
1474 /* Always use slabs for sizes less than 4 KB because the kernel aligns
1475 * everything to 4 KB.
1476 */
1477 if (size < alignment && alignment <= 4 * 1024)
1478 alloc_size = alignment;
1479
1480 if (alignment > get_slab_entry_alignment(aws, alloc_size)) {
1481 /* 3/4 allocations can return an alignment that is too small. Try again with a power-of-two
1482 * allocation size.
1483 */
1484 unsigned pot_size = get_slab_pot_entry_size(aws, alloc_size);
1485
1486 if (alignment <= pot_size) {
1487 /* This size works but wastes some memory to fulfil the alignment. */
1488 alloc_size = pot_size;
1489 } else {
1490 goto no_slab; /* can't fulfil alignment requirements */
1491 }
1492 }
1493
1494 entry = pb_slab_alloc(&aws->bo_slabs, alloc_size, heap);
1495 if (!entry) {
1496 /* Clean up buffer managers and try again. */
1497 amdgpu_clean_up_buffer_managers(aws);
1498
1499 entry = pb_slab_alloc(&aws->bo_slabs, alloc_size, heap);
1500 }
1501 if (!entry)
1502 return NULL;
1503
1504 struct amdgpu_bo_slab_entry *slab_bo = container_of(entry, struct amdgpu_bo_slab_entry, entry);
1505 pipe_reference_init(&slab_bo->b.base.reference, 1);
1506 slab_bo->b.base.size = size;
1507 slab_bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
1508 assert(alignment <= 1 << slab_bo->b.base.alignment_log2);
1509
1510 if (domain & RADEON_DOMAIN_VRAM)
1511 aws->slab_wasted_vram += get_slab_wasted_size(aws, slab_bo);
1512 else
1513 aws->slab_wasted_gtt += get_slab_wasted_size(aws, slab_bo);
1514
1515 return &slab_bo->b.base;
1516 }
1517 no_slab:
1518
1519 /* Align size to page size. This is the minimum alignment for normal
1520 * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
1521 * like constant/uniform buffers, benefit from better and more frequent reuse.
1522 */
1523 if (domain & RADEON_DOMAIN_VRAM_GTT) {
1524 size = align64(size, aws->info.gart_page_size);
1525 alignment = align(alignment, aws->info.gart_page_size);
1526 }
1527
1528 bool use_reusable_pool = !(domain & RADEON_DOMAIN_DOORBELL) &&
1529 (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) &&
1530 !(flags & (RADEON_FLAG_DISCARDABLE | RADEON_FLAG_CLEAR_VRAM));
1531
1532 if (use_reusable_pool) {
1533 /* RADEON_FLAG_NO_SUBALLOC is irrelevant for the cache. */
1534 heap = radeon_get_heap_index(domain, flags & ~RADEON_FLAG_NO_SUBALLOC);
1535 assert(heap >= 0 && heap < RADEON_NUM_HEAPS);
1536
1537 /* Get a buffer from the cache. */
1538 bo = (struct amdgpu_winsys_bo*)
1539 pb_cache_reclaim_buffer(&aws->bo_cache, size, alignment, 0, heap);
1540 if (bo) {
1541 /* If the buffer is amdgpu_bo_real_reusable, but we need amdgpu_bo_real_reusable_slab,
1542 * keep the allocation but make the structure bigger.
1543 */
1544 if (flags & RADEON_FLAG_WINSYS_SLAB_BACKING && bo->type == AMDGPU_BO_REAL_REUSABLE) {
1545 const unsigned orig_size = sizeof(struct amdgpu_bo_real_reusable);
1546 const unsigned new_size = sizeof(struct amdgpu_bo_real_reusable_slab);
1547 struct amdgpu_winsys_bo *new_bo =
1548 (struct amdgpu_winsys_bo*)REALLOC(bo, orig_size, new_size);
1549
1550 if (!new_bo) {
1551 amdgpu_winsys_bo_reference(aws, &bo, NULL);
1552 return NULL;
1553 }
1554
1555 memset((uint8_t*)new_bo + orig_size, 0, new_size - orig_size);
1556 bo = new_bo;
1557 bo->type = AMDGPU_BO_REAL_REUSABLE_SLAB;
1558 }
1559 return &bo->base;
1560 }
1561 }
1562
1563 /* Create a new one. */
1564 bo = amdgpu_create_bo(aws, size, alignment, domain, flags, heap);
1565 if (!bo) {
1566 /* Clean up buffer managers and try again. */
1567 amdgpu_clean_up_buffer_managers(aws);
1568
1569 bo = amdgpu_create_bo(aws, size, alignment, domain, flags, heap);
1570 if (!bo)
1571 return NULL;
1572 }
1573
1574 return &bo->base;
1575 }
1576
1577 static struct pb_buffer_lean *
1578 amdgpu_buffer_create(struct radeon_winsys *rws,
1579 uint64_t size,
1580 unsigned alignment,
1581 enum radeon_bo_domain domain,
1582 enum radeon_bo_flag flags)
1583 {
1584 struct pb_buffer_lean * res = amdgpu_bo_create(amdgpu_winsys(rws), size, alignment, domain,
1585 flags);
1586 return res;
1587 }
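/* Illustrative sketch only: this is roughly how a driver allocates a buffer
 * through the radeon_winsys vtable that amdgpu_buffer_create is plugged into
 * at the bottom of this file. The size, alignment, domain and flags are
 * arbitrary example values, and releasing the reference is omitted.
 */
static struct pb_buffer_lean *example_allocate_private_buffer(struct radeon_winsys *ws)
{
   /* A page-aligned, process-private GTT buffer: RADEON_FLAG_NO_INTERPROCESS_SHARING
    * makes it a candidate for the reusable cache (or suballocation) instead of
    * always hitting the kernel for a fresh BO.
    */
   return ws->buffer_create(ws, 64 * 1024, 4096, RADEON_DOMAIN_GTT,
                            RADEON_FLAG_NO_INTERPROCESS_SHARING);
}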
1588
1589 static struct pb_buffer_lean *amdgpu_bo_from_handle(struct radeon_winsys *rws,
1590 struct winsys_handle *whandle,
1591 unsigned vm_alignment,
1592 bool is_prime_linear_buffer)
1593 {
1594 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1595 struct amdgpu_bo_real *bo = NULL;
1596 enum amdgpu_bo_handle_type type;
1597 struct ac_drm_bo_import_result result = {0};
1598 uint64_t va;
1599 amdgpu_va_handle va_handle = NULL;
1600 struct amdgpu_bo_info info = {0};
1601 enum radeon_bo_domain initial = 0;
1602 enum radeon_bo_flag flags = 0;
1603 int r;
1604
1605 switch (whandle->type) {
1606 case WINSYS_HANDLE_TYPE_SHARED:
1607 assert(!aws->info.is_virtio); /* Legacy path, not supported by virtio. */
1608 type = amdgpu_bo_handle_type_gem_flink_name;
1609 break;
1610 case WINSYS_HANDLE_TYPE_FD:
1611 type = amdgpu_bo_handle_type_dma_buf_fd;
1612 break;
1613 default:
1614 return NULL;
1615 }
1616
1617 r = ac_drm_bo_import(aws->dev, type, whandle->handle, &result);
1618 if (r)
1619 return NULL;
1620
1621 simple_mtx_lock(&aws->bo_export_table_lock);
1622 bo = util_hash_table_get(aws->bo_export_table, result.bo.abo);
1623
1624 /* If the amdgpu_winsys_bo instance already exists, bump the reference
1625 * counter and return it.
1626 */
1627 if (bo) {
1628 p_atomic_inc(&bo->b.base.reference.count);
1629 simple_mtx_unlock(&aws->bo_export_table_lock);
1630
1631 /* Release the buffer handle, because we don't need it anymore.
1632 * This function is returning an existing buffer, which has its own
1633 * handle.
1634 */
1635 ac_drm_bo_free(aws->dev, result.bo);
1636 return &bo->b.base;
1637 }
1638
1639 uint32_t kms_handle;
1640 ac_drm_bo_export(aws->dev, result.bo, amdgpu_bo_handle_type_kms, &kms_handle);
1641
1642 /* Get initial domains. */
1643 r = ac_drm_bo_query_info(aws->dev, kms_handle, &info);
1644 if (r)
1645 goto error;
1646
1647 r = ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
1648 result.alloc_size,
1649 amdgpu_get_optimal_alignment(aws, result.alloc_size,
1650 vm_alignment),
1651 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
1652 if (r)
1653 goto error;
1654
1655 bo = CALLOC_STRUCT(amdgpu_bo_real);
1656 if (!bo)
1657 goto error;
1658
1659 r = amdgpu_bo_va_op_common(aws, NULL, kms_handle, false, &bo->vm_timeline_point, 0,
1660 result.alloc_size, va, AMDGPU_VM_PAGE_READABLE |
1661 AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE |
1662 (is_prime_linear_buffer ? AMDGPU_VM_MTYPE_UC : 0),
1663 AMDGPU_VA_OP_MAP);
1664 if (r)
1665 goto error;
1666
1667 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
1668 initial |= RADEON_DOMAIN_VRAM;
1669 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
1670 initial |= RADEON_DOMAIN_GTT;
1671 if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)
1672 flags |= RADEON_FLAG_NO_CPU_ACCESS;
1673 if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
1674 flags |= RADEON_FLAG_GTT_WC;
1675 if (info.alloc_flags & AMDGPU_GEM_CREATE_ENCRYPTED) {
1676 /* Imports are always possible even if the importer isn't using TMZ.
1677 * For instance libweston needs to import the buffer to be able to determine
1678 * if it can be used for scanout.
1679 */
1680 flags |= RADEON_FLAG_ENCRYPTED;
1681 *((bool*)&rws->uses_secure_bos) = true;
1682 }
1683 if (info.alloc_flags & AMDGPU_GEM_CREATE_GFX12_DCC)
1684 flags |= RADEON_FLAG_GFX12_ALLOW_DCC;
1685
1686 /* Initialize the structure. */
1687 pipe_reference_init(&bo->b.base.reference, 1);
1688 bo->b.base.placement = initial;
1689 bo->b.base.alignment_log2 = util_logbase2(info.phys_alignment ?
1690 info.phys_alignment : aws->info.gart_page_size);
1691 bo->b.base.usage = flags;
1692 bo->b.base.size = result.alloc_size;
1693 bo->b.type = AMDGPU_BO_REAL;
1694 bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
1695 simple_mtx_init(&bo->map_lock, mtx_plain);
1696 bo->bo = result.bo;
1697 bo->va_handle = va_handle;
1698 bo->kms_handle = kms_handle;
1699 bo->is_shared = true;
1700
1701 if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
1702 aws->allocated_vram += align64(bo->b.base.size, aws->info.gart_page_size);
1703 else if (bo->b.base.placement & RADEON_DOMAIN_GTT)
1704 aws->allocated_gtt += align64(bo->b.base.size, aws->info.gart_page_size);
1705
1706 amdgpu_add_buffer_to_global_list(aws, bo);
1707
1708 _mesa_hash_table_insert(aws->bo_export_table, bo->bo.abo, bo);
1709 simple_mtx_unlock(&aws->bo_export_table_lock);
1710
1711 return &bo->b.base;
1712
1713 error:
1714 simple_mtx_unlock(&aws->bo_export_table_lock);
1715 if (bo)
1716 FREE(bo);
1717 if (va_handle)
1718 ac_drm_va_range_free(va_handle);
1719 ac_drm_bo_free(aws->dev, result.bo);
1720 return NULL;
1721 }
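/* Illustrative sketch only: importing the same dma-buf twice must hand back
 * the same winsys BO, which is exactly what the bo_export_table lookup above
 * guarantees. radeon_bo_reference() is assumed to be the refcounting helper
 * declared next to struct radeon_winsys; the fd is a placeholder.
 */
static bool example_import_is_deduplicated(struct radeon_winsys *ws, int dmabuf_fd)
{
   struct winsys_handle whandle = {0};
   whandle.type = WINSYS_HANDLE_TYPE_FD;
   whandle.handle = dmabuf_fd;

   struct pb_buffer_lean *first = ws->buffer_from_handle(ws, &whandle, 0, false);
   struct pb_buffer_lean *second = ws->buffer_from_handle(ws, &whandle, 0, false);

   /* The second import only bumps the reference count of the first BO. */
   bool same = first != NULL && first == second;

   radeon_bo_reference(ws, &second, NULL);
   radeon_bo_reference(ws, &first, NULL);
   return same;
}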
1722
1723 static bool amdgpu_bo_get_handle(struct radeon_winsys *rws,
1724 struct pb_buffer_lean *buffer,
1725 struct winsys_handle *whandle)
1726 {
1727 struct amdgpu_screen_winsys *sws = amdgpu_screen_winsys(rws);
1728 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1729 enum amdgpu_bo_handle_type type;
1730 struct hash_entry *entry;
1731 int r;
1732
1733 /* Don't allow exports of slab entries and sparse buffers. */
1734 if (!is_real_bo(amdgpu_winsys_bo(buffer)))
1735 return false;
1736
1737 struct amdgpu_bo_real *bo = get_real_bo(amdgpu_winsys_bo(buffer));
1738
1739 /* Drop the REUSABLE variant of the BO type, so an exported buffer never returns to the reusable cache. */
1740 bo->b.type = AMDGPU_BO_REAL;
1741
1742 switch (whandle->type) {
1743 case WINSYS_HANDLE_TYPE_SHARED:
1744 /* This is a legacy code-path, not supported by virtio. */
1745 assert(!aws->info.is_virtio);
1746 type = amdgpu_bo_handle_type_gem_flink_name;
1747 break;
1748 case WINSYS_HANDLE_TYPE_KMS:
1749 if (sws->fd == aws->fd) {
1750 /* For virtio we can't return kms_handle, because it's not a GEM handle,
1751 * but a resource ID. Instead, repurpose the deprecated type
1752 * amdgpu_bo_handle_type_kms_noimport to request a GEM handle.
1753 */
1754 if (aws->info.is_virtio)
1755 ac_drm_bo_export(aws->dev, bo->bo,
1756 amdgpu_bo_handle_type_kms_noimport,
1757 &whandle->handle);
1758 else
1759 whandle->handle = bo->kms_handle;
1760
1761 if (bo->is_shared)
1762 return true;
1763
1764 goto hash_table_set;
1765 }
1766
1767 simple_mtx_lock(&aws->sws_list_lock);
1768 entry = _mesa_hash_table_search(sws->kms_handles, bo);
1769 simple_mtx_unlock(&aws->sws_list_lock);
1770 if (entry) {
1771 whandle->handle = (uintptr_t)entry->data;
1772 return true;
1773 }
1774 FALLTHROUGH;
1775 case WINSYS_HANDLE_TYPE_FD:
1776 type = amdgpu_bo_handle_type_dma_buf_fd;
1777 break;
1778 default:
1779 return false;
1780 }
1781
1782 r = ac_drm_bo_export(aws->dev, bo->bo, type, &whandle->handle);
1783 if (r)
1784 return false;
1785
1786 #if defined(DMA_BUF_SET_NAME_B)
1787 if (whandle->type == WINSYS_HANDLE_TYPE_FD &&
1788 !bo->is_shared) {
1789 char dmabufname[32];
1790 snprintf(dmabufname, 32, "%d-%s", getpid(), util_get_process_name());
1791 r = ioctl(whandle->handle, DMA_BUF_SET_NAME_B, (uint64_t)(uintptr_t)dmabufname);
1792 }
1793 #endif
1794
1795 if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
1796 int dma_fd = whandle->handle;
1797
1798 r = drmPrimeFDToHandle(sws->fd, dma_fd, &whandle->handle);
1799 close(dma_fd);
1800
1801 if (r)
1802 return false;
1803
1804 simple_mtx_lock(&aws->sws_list_lock);
1805 _mesa_hash_table_insert_pre_hashed(sws->kms_handles,
1806 bo->kms_handle, bo,
1807 (void*)(uintptr_t)whandle->handle);
1808 simple_mtx_unlock(&aws->sws_list_lock);
1809 }
1810
1811 hash_table_set:
1812 simple_mtx_lock(&aws->bo_export_table_lock);
1813 _mesa_hash_table_insert(aws->bo_export_table, bo->bo.abo, bo);
1814 simple_mtx_unlock(&aws->bo_export_table_lock);
1815
1816 bo->is_shared = true;
1817 return true;
1818 }
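/* Illustrative sketch only: exporting a BO as a dma-buf fd through the
 * buffer_get_handle hook installed at the bottom of this file. On success
 * the fd is returned in whandle.handle and is owned by the caller; the code
 * above additionally labels the dma-buf with the process name when
 * DMA_BUF_SET_NAME_B is available.
 */
static int example_export_dmabuf_fd(struct radeon_winsys *ws, struct pb_buffer_lean *buf)
{
   struct winsys_handle whandle = {0};
   whandle.type = WINSYS_HANDLE_TYPE_FD;

   /* Fails for slab entries and sparse buffers, which can't be exported. */
   if (!ws->buffer_get_handle(ws, buf, &whandle))
      return -1;

   return (int)whandle.handle;
}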
1819
1820 static struct pb_buffer_lean *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
1821 void *pointer, uint64_t size,
1822 enum radeon_bo_flag flags)
1823 {
1824 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1825 ac_drm_bo buf_handle;
1826 struct amdgpu_bo_real *bo;
1827 uint64_t va;
1828 amdgpu_va_handle va_handle;
1829 /* Avoid failure when the size is not page aligned */
1830 uint64_t aligned_size = align64(size, aws->info.gart_page_size);
1831
1832 bo = CALLOC_STRUCT(amdgpu_bo_real);
1833 if (!bo)
1834 return NULL;
1835
1836 if (ac_drm_create_bo_from_user_mem(aws->dev, pointer,
1837 aligned_size, &buf_handle))
1838 goto error;
1839
1840 if (ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
1841 aligned_size,
1842 amdgpu_get_optimal_alignment(aws, aligned_size,
1843 aws->info.gart_page_size),
1844 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH))
1845 goto error_va_alloc;
1846
1847 uint32_t kms_handle;
1848 ac_drm_bo_export(aws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
1849
1850 if (amdgpu_bo_va_op_common(aws, NULL, kms_handle, false, &bo->vm_timeline_point, 0,
1851 aligned_size, va, AMDGPU_VM_PAGE_READABLE |
1852 AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
1853 AMDGPU_VA_OP_MAP))
1854 goto error_va_map;
1855
1856 /* Initialize it. */
1857 bo->is_user_ptr = true;
1858 pipe_reference_init(&bo->b.base.reference, 1);
1859 bo->b.base.placement = RADEON_DOMAIN_GTT;
1860 bo->b.base.alignment_log2 = 0;
1861 bo->b.base.size = size;
1862 bo->b.type = AMDGPU_BO_REAL;
1863 bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
1864 simple_mtx_init(&bo->map_lock, mtx_plain);
1865 bo->bo = buf_handle;
1866 bo->cpu_ptr = pointer;
1867 bo->va_handle = va_handle;
1868 bo->kms_handle = kms_handle;
1869
1870 aws->allocated_gtt += aligned_size;
1871
1872 amdgpu_add_buffer_to_global_list(aws, bo);
1873
1874 return (struct pb_buffer_lean*)bo;
1875
1876 error_va_map:
1877 ac_drm_va_range_free(va_handle);
1878
1879 error_va_alloc:
1880 ac_drm_bo_free(aws->dev, buf_handle);
1881
1882 error:
1883 FREE(bo);
1884 return NULL;
1885 }
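/* Illustrative sketch only: wrapping application memory as a GTT BO through
 * the buffer_from_ptr hook installed at the bottom of this file. The kernel
 * userptr path expects the pointer itself to be page-aligned; the code above
 * only pads the size. The memory must stay valid for the lifetime of the
 * returned BO.
 */
static struct pb_buffer_lean *example_wrap_user_memory(struct radeon_winsys *ws,
                                                       void *page_aligned_mem,
                                                       uint64_t size)
{
   return ws->buffer_from_ptr(ws, page_aligned_mem, size, 0);
}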
1886
1887 static bool amdgpu_bo_is_user_ptr(struct pb_buffer_lean *buf)
1888 {
1889 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1890
1891 return is_real_bo(bo) ? get_real_bo(bo)->is_user_ptr : false;
1892 }
1893
1894 static bool amdgpu_bo_is_suballocated(struct pb_buffer_lean *buf)
1895 {
1896 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1897
1898 return bo->type == AMDGPU_BO_SLAB_ENTRY;
1899 }
1900
1901 uint64_t amdgpu_bo_get_va(struct pb_buffer_lean *buf)
1902 {
1903 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1904
1905 if (bo->type == AMDGPU_BO_SLAB_ENTRY) {
1906 struct amdgpu_bo_real_reusable_slab *slab_bo =
1907 (struct amdgpu_bo_real_reusable_slab *)get_slab_entry_real_bo(bo);
1908
1909 return amdgpu_va_get_start_addr(slab_bo->b.b.va_handle) + get_slab_entry_offset(bo);
1910 } else if (bo->type == AMDGPU_BO_SPARSE) {
1911 return amdgpu_va_get_start_addr(get_sparse_bo(bo)->va_handle);
1912 } else {
1913 return amdgpu_va_get_start_addr(get_real_bo(bo)->va_handle);
1914 }
1915 }
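/* Illustrative sketch only: consumers read the GPU virtual address through
 * the buffer_get_virtual_address hook installed below, e.g. when building a
 * buffer descriptor. For slab entries the address already includes the
 * entry's offset inside its backing BO, as computed above.
 */
static uint64_t example_descriptor_base_address(struct radeon_winsys *ws,
                                                struct pb_buffer_lean *buf)
{
   return ws->buffer_get_virtual_address(buf);
}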
1916
1917 static void amdgpu_buffer_destroy(struct radeon_winsys *rws, struct pb_buffer_lean *buf)
1918 {
1919 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1920
1921 if (bo->type == AMDGPU_BO_SLAB_ENTRY)
1922 amdgpu_bo_slab_destroy(rws, buf);
1923 else if (bo->type == AMDGPU_BO_SPARSE)
1924 amdgpu_bo_sparse_destroy(rws, buf);
1925 else
1926 amdgpu_bo_destroy_or_cache(rws, buf);
1927 }
1928
1929 void amdgpu_bo_init_functions(struct amdgpu_screen_winsys *sws)
1930 {
1931 sws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
1932 sws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
1933 sws->base.buffer_map = amdgpu_bo_map;
1934 sws->base.buffer_unmap = amdgpu_bo_unmap;
1935 sws->base.buffer_wait = amdgpu_bo_wait;
1936 sws->base.buffer_create = amdgpu_buffer_create;
1937 sws->base.buffer_destroy = amdgpu_buffer_destroy;
1938 sws->base.buffer_from_handle = amdgpu_bo_from_handle;
1939 sws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
1940 sws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
1941 sws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
1942 sws->base.buffer_get_handle = amdgpu_bo_get_handle;
1943 sws->base.buffer_commit = amdgpu_bo_sparse_commit;
1944 sws->base.buffer_find_next_committed_memory = amdgpu_bo_find_next_committed_memory;
1945 sws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
1946 sws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
1947 sws->base.buffer_get_flags = amdgpu_bo_get_flags;
1948 }
1949