1 /*
2  * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
3  * Copyright © 2015 Advanced Micro Devices, Inc.
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include <sys/ioctl.h>
9 
10 #include "amdgpu_cs.h"
11 
12 #include "util/os_drm.h"
13 #include "util/hash_table.h"
14 #include "util/os_time.h"
15 #include "util/u_hash_table.h"
16 #include "util/u_process.h"
17 #include "frontend/drm_driver.h"
18 #include "drm-uapi/amdgpu_drm.h"
19 #include "drm-uapi/dma-buf.h"
20 #include "sid.h"
21 #include <xf86drm.h>
22 #include <stdio.h>
23 #include <inttypes.h>
24 
25 #ifndef AMDGPU_VA_RANGE_HIGH
26 #define AMDGPU_VA_RANGE_HIGH	0x2
27 #endif
28 
29 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
30 #define DEBUG_SPARSE_COMMITS 0
31 
32 struct amdgpu_sparse_backing_chunk {
33    uint32_t begin, end;
34 };
35 
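/* Wait on a single BO fence while holding aws->bo_fence_lock. Returns true and
 * clears the fence slot once it's idle; returns false if the fence is still
 * busy, in which case the lock has already been released for the caller.
 */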
36 static bool amdgpu_bo_fence_wait(struct amdgpu_winsys *aws,
37                                  struct pipe_fence_handle **fence,
38                                  uint64_t timeout, int64_t abs_timeout)
39 {
40    if (timeout == 0) {
41       bool idle = amdgpu_fence_wait(*fence, 0, false);
42 
43       if (!idle) {
44          simple_mtx_unlock(&aws->bo_fence_lock);
45          return false; /* busy */
46       }
47 
48       /* It's idle. Remove it from the ring to skip checking it again later. */
49       amdgpu_fence_reference(fence, NULL);
50    } else {
51       struct pipe_fence_handle *tmp_fence = NULL;
52       amdgpu_fence_reference(&tmp_fence, *fence);
53 
54       /* While waiting, unlock the mutex. */
55       simple_mtx_unlock(&aws->bo_fence_lock);
56 
57       bool idle = amdgpu_fence_wait(tmp_fence, abs_timeout, true);
58       if (!idle) {
59          amdgpu_fence_reference(&tmp_fence, NULL);
60          return false; /* busy */
61       }
62 
63       simple_mtx_lock(&aws->bo_fence_lock);
64       /* It's idle. Remove it from the ring to skip checking it again later. */
65       if (tmp_fence == *fence)
66          amdgpu_fence_reference(fence, NULL);
67       amdgpu_fence_reference(&tmp_fence, NULL);
68    }
69 
70    return true;
71 }
72 
73 static bool amdgpu_bo_wait(struct radeon_winsys *rws,
74                            struct pb_buffer_lean *_buf, uint64_t timeout,
75                            unsigned usage)
76 {
77    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
78    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
79    int64_t abs_timeout = 0;
80 
81    assert(p_atomic_read(&bo->num_active_ioctls) >= 0);
82 
83    if (timeout == 0) {
84       if (p_atomic_read(&bo->num_active_ioctls))
85          return false;
86 
87    } else {
88       abs_timeout = os_time_get_absolute_timeout(timeout);
89 
90       /* Wait if any ioctl is being submitted with this buffer. */
91       if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
92          return false;
93    }
94 
95    if (is_real_bo(bo) && (get_real_bo(bo)->is_shared || get_real_bo(bo)->slab_has_busy_alt_fences)) {
96       /* We can't use user fences for shared buffers, because user fences are local to this
97        * process only. If we want to wait for all buffer uses in all processes, we have to
98        * use amdgpu_bo_wait_for_idle.
99        *
100        * Additionally, if this is a slab buffer and one of the slab entries has non-NULL
101        * alt_fence, we can't easily wait for that here. Instead, use the kernel ioctl to wait
102        * for the buffer.
103        */
104       bool buffer_busy = true;
105       int r;
106 
107       /* The GEM_WAIT_IDLE ioctl with timeout=0 can take up to 1 ms to return. This is a kernel
108        * inefficiency. This flag indicates whether it's better to return busy than wait for 1 ms.
109        */
110       if (timeout == 0 && usage & RADEON_USAGE_DISALLOW_SLOW_REPLY)
111          return false;
112 
113       r = ac_drm_bo_wait_for_idle(aws->dev, get_real_bo(bo)->bo, timeout, &buffer_busy);
114       if (r)
115          fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__, r);
116 
117       if (!buffer_busy)
118          get_real_bo(bo)->slab_has_busy_alt_fences = false;
119       return !buffer_busy;
120    }
121 
122    simple_mtx_lock(&aws->bo_fence_lock);
123 
124    u_foreach_bit(i, bo->fences.valid_fence_mask) {
125       struct pipe_fence_handle **fence = get_fence_from_ring(aws, &bo->fences, i);
126 
127       if (fence) {
128          /* This also unlocks the mutex on failure. */
129          if (!amdgpu_bo_fence_wait(aws, fence, timeout, abs_timeout))
130             return false;
131       }
132 
133       bo->fences.valid_fence_mask &= ~BITFIELD_BIT(i); /* remove the fence from the BO */
134    }
135 
136    /* Also wait for alt_fence. */
137    if (bo->alt_fence) {
138       /* This also unlocks the mutex on failure. */
139       if (!amdgpu_bo_fence_wait(aws, &bo->alt_fence, timeout, abs_timeout))
140          return false;
141    }
142 
143    simple_mtx_unlock(&aws->bo_fence_lock);
144    return true; /* idle */
145 }
146 
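/* Gather the syncobj handles of all still-busy fences of this BO (idle fences
 * are dropped from the BO) so they can be passed as input dependencies to a
 * userqueue VM ioctl.
 */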
147 static void amdgpu_bo_get_syncobjs(struct amdgpu_winsys *aws, struct amdgpu_winsys_bo *bo,
148                                    uint32_t *syncobj, uint32_t *num_fences)
149 {
150    if (p_atomic_read(&bo->num_active_ioctls))
151       os_wait_until_zero(&bo->num_active_ioctls, OS_TIMEOUT_INFINITE);
152 
153    simple_mtx_lock(&aws->bo_fence_lock);
154    u_foreach_bit(queue_index, bo->fences.valid_fence_mask) {
155       struct pipe_fence_handle **fence = get_fence_from_ring(aws, &bo->fences, queue_index);
156       if (fence) {
157          if (!amdgpu_fence_wait(*fence, 0, 0)) {
158             syncobj[(*num_fences)++] = ((struct amdgpu_fence*)*fence)->syncobj;
159          } else {
160             amdgpu_fence_reference(fence, NULL);
161             /* remove the fence from the BO */
162             bo->fences.valid_fence_mask &= ~BITFIELD_BIT(queue_index);
163          }
164       }
165    }
166 
167    if (bo->alt_fence) {
168       if (!amdgpu_fence_wait(bo->alt_fence, 0, 0))
169          syncobj[(*num_fences)++] = ((struct amdgpu_fence*)bo->alt_fence)->syncobj;
170       else
171          amdgpu_fence_reference(&bo->alt_fence, NULL);
172    }
173    simple_mtx_unlock(&aws->bo_fence_lock);
174 }
175 
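/* Execute a GPU VA map/unmap operation. With user queues, the BO's busy fences
 * are passed as input syncobjs and the VM timeline syncobj is signaled at a new
 * sequence number, which is optionally returned via vm_timeline_point.
 */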
176 static int amdgpu_bo_va_op_common(struct amdgpu_winsys *aws, struct amdgpu_winsys_bo *bo,
177                                   uint32_t bo_handle, bool send_input_fence,
178                                   uint64_t *vm_timeline_point, uint64_t offset, uint64_t size,
179                                   uint64_t addr, uint64_t flags, uint32_t ops)
180 {
181    int r;
182 
183    if (aws->info.use_userq) {
184       uint32_t syncobj_arr[AMDGPU_MAX_QUEUES + 1];
185       uint32_t num_fences = 0;
186 
187       if (send_input_fence)
188          amdgpu_bo_get_syncobjs(aws, bo, &syncobj_arr[0], &num_fences);
189 
190       /* The lock guarantees that the execution ordering of the VM ioctls matches the timeline
191        * sequence number ordering.
192        */
193       simple_mtx_lock(&aws->vm_ioctl_lock);
194       aws->vm_timeline_seq_num++;
195       if (vm_timeline_point) {
196          /* Sparse buffers can be updated concurrently by another thread, so we use an atomic
197           * operation to get a valid seqno.
198           */
199          p_atomic_set(vm_timeline_point, aws->vm_timeline_seq_num);
200       }
201       r = ac_drm_bo_va_op_raw2(aws->dev, bo_handle, offset, size, addr, flags, ops,
202                                aws->vm_timeline_syncobj, aws->vm_timeline_seq_num,
203                                (uintptr_t)&syncobj_arr, num_fences);
204       simple_mtx_unlock(&aws->vm_ioctl_lock);
205    } else {
206       r = ac_drm_bo_va_op_raw(aws->dev, bo_handle, offset, size, addr, flags, ops);
207    }
208 
209    return r;
210 }
211 
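/* Return the byte offset of a slab entry within its backing slab buffer. */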
212 static inline unsigned get_slab_entry_offset(struct amdgpu_winsys_bo *bo)
213 {
214    struct amdgpu_bo_slab_entry *slab_entry_bo = get_slab_entry_bo(bo);
215    struct amdgpu_bo_real_reusable_slab *slab_bo =
216       (struct amdgpu_bo_real_reusable_slab *)get_slab_entry_real_bo(bo);
217    unsigned entry_index = slab_entry_bo - slab_bo->entries;
218 
219    return slab_bo->slab.entry_size * entry_index;
220 }
221 
222 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
223       struct pb_buffer_lean *buf)
224 {
225    return ((struct amdgpu_winsys_bo*)buf)->base.placement;
226 }
227 
228 static enum radeon_bo_flag amdgpu_bo_get_flags(
229       struct pb_buffer_lean *buf)
230 {
231    return ((struct amdgpu_winsys_bo*)buf)->base.usage;
232 }
233 
234 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
235 {
236    bo->fences.valid_fence_mask = 0;
237    amdgpu_fence_reference(&bo->alt_fence, NULL);
238 }
239 
240 void amdgpu_bo_destroy(struct amdgpu_winsys *aws, struct pb_buffer_lean *_buf)
241 {
242    struct amdgpu_bo_real *bo = get_real_bo(amdgpu_winsys_bo(_buf));
243    struct amdgpu_screen_winsys *sws_iter;
244 
245    simple_mtx_lock(&aws->bo_export_table_lock);
246 
247    /* amdgpu_bo_from_handle might have revived the bo */
248    if (p_atomic_read(&bo->b.base.reference.count)) {
249       simple_mtx_unlock(&aws->bo_export_table_lock);
250       return;
251    }
252 
253    _mesa_hash_table_remove_key(aws->bo_export_table, bo->bo.abo);
254 
255    if (bo->b.base.placement & RADEON_DOMAIN_VRAM_GTT) {
256       amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(_buf), bo->kms_handle, true, NULL, 0,
257                              bo->b.base.size, amdgpu_va_get_start_addr(bo->va_handle),
258                              AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
259                                 AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_UNMAP);
260       ac_drm_va_range_free(bo->va_handle);
261    }
262 
263    simple_mtx_unlock(&aws->bo_export_table_lock);
264 
265    if (!bo->is_user_ptr && bo->cpu_ptr) {
266       bo->cpu_ptr = NULL;
267       amdgpu_bo_unmap(&aws->dummy_sws.base, &bo->b.base);
268    }
269    assert(bo->is_user_ptr || bo->map_count == 0);
270 
271    ac_drm_bo_free(aws->dev, bo->bo);
272 
273 #if MESA_DEBUG
274    if (aws->debug_all_bos) {
275       simple_mtx_lock(&aws->global_bo_list_lock);
276       list_del(&bo->global_list_item);
277       aws->num_buffers--;
278       simple_mtx_unlock(&aws->global_bo_list_lock);
279    }
280 #endif
281 
282    /* Close all KMS handles retrieved for other DRM file descriptions */
283    simple_mtx_lock(&aws->sws_list_lock);
284    for (sws_iter = aws->sws_list; sws_iter; sws_iter = sws_iter->next) {
285       struct hash_entry *entry;
286 
287       if (!sws_iter->kms_handles)
288          continue;
289 
290       entry = _mesa_hash_table_search(sws_iter->kms_handles, bo);
291       if (entry) {
292          struct drm_gem_close args = { .handle = (uintptr_t)entry->data };
293 
294          drm_ioctl(sws_iter->fd, DRM_IOCTL_GEM_CLOSE, &args);
295          _mesa_hash_table_remove(sws_iter->kms_handles, entry);
296       }
297    }
298    simple_mtx_unlock(&aws->sws_list_lock);
299 
300    amdgpu_bo_remove_fences(&bo->b);
301 
302    if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
303       aws->allocated_vram -= align64(bo->b.base.size, aws->info.gart_page_size);
304    else if (bo->b.base.placement & RADEON_DOMAIN_GTT)
305       aws->allocated_gtt -= align64(bo->b.base.size, aws->info.gart_page_size);
306 
307    simple_mtx_destroy(&bo->map_lock);
308    FREE(bo);
309 }
310 
311 static void amdgpu_bo_destroy_or_cache(struct radeon_winsys *rws, struct pb_buffer_lean *_buf)
312 {
313    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
314    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
315 
316    assert(is_real_bo(bo)); /* slab buffers have a separate vtbl */
317 
318    if (bo->type >= AMDGPU_BO_REAL_REUSABLE)
319       pb_cache_add_buffer(&aws->bo_cache, &((struct amdgpu_bo_real_reusable*)bo)->cache_entry);
320    else
321       amdgpu_bo_destroy(aws, _buf);
322 }
323 
324 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *aws)
325 {
326    pb_slabs_reclaim(&aws->bo_slabs);
327    pb_cache_release_all_buffers(&aws->bo_cache);
328 }
329 
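/* CPU-map a real BO. If the first attempt fails, reclaim cached buffers and
 * retry once. The mapped VRAM/GTT statistics are updated on the first mapping.
 */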
330 static bool amdgpu_bo_do_map(struct radeon_winsys *rws, struct amdgpu_bo_real *bo, void **cpu)
331 {
332    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
333 
334    assert(!bo->is_user_ptr);
335 
336    *cpu = NULL;
337    int r = ac_drm_bo_cpu_map(aws->dev, bo->bo, cpu);
338 
339    if (r) {
340       /* Clean up buffer managers and try again. */
341       amdgpu_clean_up_buffer_managers(aws);
342       r = ac_drm_bo_cpu_map(aws->dev, bo->bo, cpu);
343       if (r)
344          return false;
345    }
346 
347    if (p_atomic_inc_return(&bo->map_count) == 1) {
348       if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
349          aws->mapped_vram += bo->b.base.size;
350       else if (bo->b.base.placement & RADEON_DOMAIN_GTT)
351          aws->mapped_gtt += bo->b.base.size;
352       aws->num_mapped_buffers++;
353    }
354 
355    return true;
356 }
357 
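/* Map a buffer for CPU access. Unless PIPE_MAP_UNSYNCHRONIZED is set, this
 * flushes the current CS and/or waits for GPU usage according to the usage
 * flags before mapping. Sparse buffers cannot be mapped.
 */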
358 void *amdgpu_bo_map(struct radeon_winsys *rws,
359                     struct pb_buffer_lean *buf,
360                     struct radeon_cmdbuf *rcs,
361                     enum pipe_map_flags usage)
362 {
363    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
364    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
365    struct amdgpu_bo_real *real;
366    struct amdgpu_cs *cs = rcs ? amdgpu_cs(rcs) : NULL;
367 
368    assert(bo->type != AMDGPU_BO_SPARSE);
369 
370    /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
371    if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
372       /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
373       if (usage & PIPE_MAP_DONTBLOCK) {
374          if (!(usage & PIPE_MAP_WRITE)) {
375             /* Mapping for read.
376              *
377              * Since we are mapping for read, we don't need to wait
378              * if the GPU is using the buffer for read too
379              * (neither one is changing it).
380              *
381              * Only check whether the buffer is being used for write. */
382             if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
383                                                                RADEON_USAGE_WRITE)) {
384                cs->flush_cs(cs->flush_data,
385 			    RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
386                return NULL;
387             }
388 
389             if (!amdgpu_bo_wait(rws, (struct pb_buffer_lean*)bo, 0,
390                                 RADEON_USAGE_WRITE)) {
391                return NULL;
392             }
393          } else {
394             if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
395                cs->flush_cs(cs->flush_data,
396 			    RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
397                return NULL;
398             }
399 
400             if (!amdgpu_bo_wait(rws, (struct pb_buffer_lean*)bo, 0,
401                                 RADEON_USAGE_READWRITE)) {
402                return NULL;
403             }
404          }
405       } else {
406          uint64_t time = os_time_get_nano();
407 
408          if (!(usage & PIPE_MAP_WRITE)) {
409             /* Mapping for read.
410              *
411              * Since we are mapping for read, we don't need to wait
412              * if the GPU is using the buffer for read too
413              * (neither one is changing it).
414              *
415              * Only check whether the buffer is being used for write. */
416             if (cs) {
417                if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
418                                                             RADEON_USAGE_WRITE)) {
419                   cs->flush_cs(cs->flush_data,
420 			       RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
421                } else {
422                   /* Try to avoid busy-waiting in amdgpu_bo_wait. */
423                   if (p_atomic_read(&bo->num_active_ioctls))
424                      amdgpu_cs_sync_flush(rcs);
425                }
426             }
427 
428             amdgpu_bo_wait(rws, (struct pb_buffer_lean*)bo, OS_TIMEOUT_INFINITE,
429                            RADEON_USAGE_WRITE);
430          } else {
431             /* Mapping for write. */
432             if (cs) {
433                if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
434                   cs->flush_cs(cs->flush_data,
435 			       RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
436                } else {
437                   /* Try to avoid busy-waiting in amdgpu_bo_wait. */
438                   if (p_atomic_read(&bo->num_active_ioctls))
439                      amdgpu_cs_sync_flush(rcs);
440                }
441             }
442 
443             amdgpu_bo_wait(rws, (struct pb_buffer_lean*)bo, OS_TIMEOUT_INFINITE,
444                            RADEON_USAGE_READWRITE);
445          }
446 
447          aws->buffer_wait_time += os_time_get_nano() - time;
448       }
449    }
450 
451    /* Buffer synchronization has been checked, now actually map the buffer. */
452    void *cpu = NULL;
453    uint64_t offset = 0;
454 
455    if (is_real_bo(bo)) {
456       real = get_real_bo(bo);
457    } else {
458       real = get_slab_entry_real_bo(bo);
459       offset = get_slab_entry_offset(bo);
460    }
461 
462    if (usage & RADEON_MAP_TEMPORARY) {
463       if (real->is_user_ptr) {
464          cpu = real->cpu_ptr;
465       } else {
466          if (!amdgpu_bo_do_map(rws, real, &cpu))
467             return NULL;
468       }
469    } else {
470       cpu = p_atomic_read(&real->cpu_ptr);
471       if (!cpu) {
472          simple_mtx_lock(&real->map_lock);
473          /* Must re-check due to the possibility of a race. Re-check need not
474           * be atomic thanks to the lock. */
475          cpu = real->cpu_ptr;
476          if (!cpu) {
477             if (!amdgpu_bo_do_map(rws, real, &cpu)) {
478                simple_mtx_unlock(&real->map_lock);
479                return NULL;
480             }
481             p_atomic_set(&real->cpu_ptr, cpu);
482          }
483          simple_mtx_unlock(&real->map_lock);
484       }
485    }
486 
487    return (uint8_t*)cpu + offset;
488 }
489 
490 void amdgpu_bo_unmap(struct radeon_winsys *rws, struct pb_buffer_lean *buf)
491 {
492    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
493    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
494    struct amdgpu_bo_real *real;
495 
496    assert(bo->type != AMDGPU_BO_SPARSE);
497 
498    real = is_real_bo(bo) ? get_real_bo(bo) : get_slab_entry_real_bo(bo);
499 
500    if (real->is_user_ptr)
501       return;
502 
503    assert(real->map_count != 0 && "too many unmaps");
504    if (p_atomic_dec_zero(&real->map_count)) {
505       assert(!real->cpu_ptr &&
506              "too many unmaps or forgot RADEON_MAP_TEMPORARY flag");
507 
508       if (real->b.base.placement & RADEON_DOMAIN_VRAM)
509          aws->mapped_vram -= real->b.base.size;
510       else if (real->b.base.placement & RADEON_DOMAIN_GTT)
511          aws->mapped_gtt -= real->b.base.size;
512       aws->num_mapped_buffers--;
513    }
514 
515    assert(aws->dev);
516    ac_drm_bo_cpu_unmap(aws->dev, real->bo);
517 }
518 
519 static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys *aws, struct amdgpu_bo_real *bo)
520 {
521 #if MESA_DEBUG
522    if (aws->debug_all_bos) {
523       simple_mtx_lock(&aws->global_bo_list_lock);
524       list_addtail(&bo->global_list_item, &aws->global_bo_list);
525       aws->num_buffers++;
526       simple_mtx_unlock(&aws->global_bo_list_lock);
527    }
528 #endif
529 }
530 
531 static unsigned amdgpu_get_optimal_alignment(struct amdgpu_winsys *aws,
532                                              uint64_t size, unsigned alignment)
533 {
534    /* Increase the alignment for faster address translation and better memory
535     * access pattern.
536     */
537    if (size >= aws->info.pte_fragment_size) {
538       alignment = MAX2(alignment, aws->info.pte_fragment_size);
539    } else if (size) {
540       unsigned msb = util_last_bit(size);
541 
542       alignment = MAX2(alignment, 1u << (msb - 1));
543    }
544    return alignment;
545 }
546 
547 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *aws,
548                                                  uint64_t size,
549                                                  unsigned alignment,
550                                                  enum radeon_bo_domain initial_domain,
551                                                  unsigned flags,
552                                                  int heap)
553 {
554    struct amdgpu_bo_alloc_request request = {0};
555    ac_drm_bo buf_handle;
556    uint64_t va = 0;
557    struct amdgpu_bo_real *bo;
558    amdgpu_va_handle va_handle = NULL;
559    int r;
560 
561    /* VRAM or GTT must be specified, but not both at the same time. */
562    assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT |
563                                           RADEON_DOMAIN_GDS |
564                                           RADEON_DOMAIN_OA |
565                                           RADEON_DOMAIN_DOORBELL)) == 1);
566 
567    alignment = amdgpu_get_optimal_alignment(aws, size, alignment);
568 
569    if (heap >= 0 && flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) {
570       struct amdgpu_bo_real_reusable *new_bo;
571       bool slab_backing = flags & RADEON_FLAG_WINSYS_SLAB_BACKING;
572 
573       if (slab_backing)
574          new_bo = (struct amdgpu_bo_real_reusable *)CALLOC_STRUCT(amdgpu_bo_real_reusable_slab);
575       else
576          new_bo = CALLOC_STRUCT(amdgpu_bo_real_reusable);
577 
578       if (!new_bo)
579          return NULL;
580 
581       bo = &new_bo->b;
582       pb_cache_init_entry(&aws->bo_cache, &new_bo->cache_entry, &bo->b.base, heap);
583       bo->b.type = slab_backing ? AMDGPU_BO_REAL_REUSABLE_SLAB : AMDGPU_BO_REAL_REUSABLE;
584    } else {
585       bo = CALLOC_STRUCT(amdgpu_bo_real);
586       if (!bo)
587          return NULL;
588 
589       bo->b.type = AMDGPU_BO_REAL;
590    }
591 
592    request.alloc_size = size;
593    request.phys_alignment = alignment;
594 
595    if (initial_domain & RADEON_DOMAIN_VRAM) {
596       request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
597 
598       /* Since VRAM and GTT have almost the same performance on APUs, we could
599        * just set GTT. However, in order to decrease GTT(RAM) usage, which is
600        * shared with the OS, allow VRAM placements too. The idea is not to use
601        * VRAM usefully, but to use it so that it's not unused and wasted.
602        */
603       if (!aws->info.has_dedicated_vram)
604          request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
605    }
606 
607    if (initial_domain & RADEON_DOMAIN_GTT)
608       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
609    if (initial_domain & RADEON_DOMAIN_GDS)
610       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
611    if (initial_domain & RADEON_DOMAIN_OA)
612       request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;
613    if (initial_domain & RADEON_DOMAIN_DOORBELL)
614       request.preferred_heap |= AMDGPU_GEM_DOMAIN_DOORBELL;
615 
616    if (flags & RADEON_FLAG_NO_CPU_ACCESS)
617       request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
618    if (flags & RADEON_FLAG_GTT_WC)
619       request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
620 
621    if (aws->info.has_local_buffers &&
622        initial_domain & (RADEON_DOMAIN_VRAM_GTT | RADEON_DOMAIN_DOORBELL) &&
623        flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
624       request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
625 
626    if (flags & RADEON_FLAG_DISCARDABLE &&
627        aws->info.drm_minor >= 47)
628       request.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
629 
630    if ((flags & RADEON_FLAG_CLEAR_VRAM) || (aws->zero_all_vram_allocs &&
631         (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)))
632       request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
633 
634    if ((flags & RADEON_FLAG_ENCRYPTED) &&
635        aws->info.has_tmz_support) {
636       request.flags |= AMDGPU_GEM_CREATE_ENCRYPTED;
637 
638       if (!(flags & RADEON_FLAG_DRIVER_INTERNAL)) {
639          struct amdgpu_screen_winsys *sws_iter;
640          simple_mtx_lock(&aws->sws_list_lock);
641          for (sws_iter = aws->sws_list; sws_iter; sws_iter = sws_iter->next) {
642             *((bool*) &sws_iter->base.uses_secure_bos) = true;
643          }
644          simple_mtx_unlock(&aws->sws_list_lock);
645       }
646    }
647 
648    if (flags & RADEON_FLAG_GFX12_ALLOW_DCC)
649       request.flags |= AMDGPU_GEM_CREATE_GFX12_DCC;
650 
651    /* Set AMDGPU_GEM_CREATE_VIRTIO_SHARED if the driver didn't disable buffer sharing. */
652    if (aws->info.is_virtio && (initial_domain & RADEON_DOMAIN_VRAM_GTT) &&
653        (flags & (RADEON_FLAG_DRIVER_INTERNAL | RADEON_FLAG_NO_INTERPROCESS_SHARING)) == 0)
654       request.flags |= AMDGPU_GEM_CREATE_VIRTIO_SHARED;
655 
656    r = ac_drm_bo_alloc(aws->dev, &request, &buf_handle);
657    if (r) {
658       fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
659       fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
660       fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
661       fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
662       fprintf(stderr, "amdgpu:    flags   : %" PRIx64 "\n", request.flags);
663       goto error_bo_alloc;
664    }
665 
666    uint32_t kms_handle = 0;
667    ac_drm_bo_export(aws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
668 
669    if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {
670       unsigned va_gap_size = aws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
671 
672       r = ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
673                                 size + va_gap_size, alignment,
674                                 0, &va, &va_handle,
675                                 (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
676                                 AMDGPU_VA_RANGE_HIGH);
677       if (r)
678          goto error_va_alloc;
679 
680       unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
681                           AMDGPU_VM_PAGE_WRITEABLE |
682                           AMDGPU_VM_PAGE_EXECUTABLE;
683 
684       if (flags & RADEON_FLAG_GL2_BYPASS)
685          vm_flags |= AMDGPU_VM_MTYPE_UC;
686 
687       r = amdgpu_bo_va_op_common(aws, NULL, kms_handle, false, &bo->vm_timeline_point, 0,
688                                  size, va, vm_flags, AMDGPU_VA_OP_MAP);
689       if (r)
690          goto error_va_map;
691    }
692 
693    simple_mtx_init(&bo->map_lock, mtx_plain);
694    pipe_reference_init(&bo->b.base.reference, 1);
695    bo->b.base.placement = initial_domain;
696    bo->b.base.alignment_log2 = util_logbase2(alignment);
697    bo->b.base.usage = flags;
698    bo->b.base.size = size;
699    bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
700    bo->bo = buf_handle;
701    bo->va_handle = va_handle;
702    bo->kms_handle = kms_handle;
703 
704    if (initial_domain & RADEON_DOMAIN_VRAM)
705       aws->allocated_vram += align64(size, aws->info.gart_page_size);
706    else if (initial_domain & RADEON_DOMAIN_GTT)
707       aws->allocated_gtt += align64(size, aws->info.gart_page_size);
708 
709    amdgpu_add_buffer_to_global_list(aws, bo);
710 
711    return &bo->b;
712 
713 error_va_map:
714    ac_drm_va_range_free(va_handle);
715 
716 error_va_alloc:
717    ac_drm_bo_free(aws->dev, buf_handle);
718 
719 error_bo_alloc:
720    FREE(bo);
721    return NULL;
722 }
723 
724 bool amdgpu_bo_can_reclaim(struct amdgpu_winsys *aws, struct pb_buffer_lean *_buf)
725 {
726    return amdgpu_bo_wait(&aws->dummy_sws.base, _buf, 0, RADEON_USAGE_READWRITE);
727 }
728 
729 bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
730 {
731    struct amdgpu_bo_slab_entry *bo = container_of(entry, struct amdgpu_bo_slab_entry, entry);
732 
733    return amdgpu_bo_can_reclaim(priv, &bo->b.base);
734 }
735 
736 static unsigned get_slab_wasted_size(struct amdgpu_winsys *aws, struct amdgpu_bo_slab_entry *bo)
737 {
738    assert(bo->b.base.size <= bo->entry.slab->entry_size);
739    assert(bo->b.base.size < (1 << bo->b.base.alignment_log2) ||
740           bo->b.base.size < 1 << aws->bo_slabs.min_order ||
741           bo->b.base.size > bo->entry.slab->entry_size / 2);
742    return bo->entry.slab->entry_size - bo->b.base.size;
743 }
744 
745 static void amdgpu_bo_slab_destroy(struct radeon_winsys *rws, struct pb_buffer_lean *_buf)
746 {
747    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
748    struct amdgpu_bo_slab_entry *bo = get_slab_entry_bo(amdgpu_winsys_bo(_buf));
749 
750    if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
751       aws->slab_wasted_vram -= get_slab_wasted_size(aws, bo);
752    else
753       aws->slab_wasted_gtt -= get_slab_wasted_size(aws, bo);
754 
755    pb_slab_free(&aws->bo_slabs, &bo->entry);
756 }
757 
758 /* Return the power of two size of a slab entry matching the input size. */
759 static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *aws, unsigned size)
760 {
761    unsigned entry_size = util_next_power_of_two(size);
762    unsigned min_entry_size = 1 << aws->bo_slabs.min_order;
763 
764    return MAX2(entry_size, min_entry_size);
765 }
766 
767 /* Return the slab entry alignment. */
768 static unsigned get_slab_entry_alignment(struct amdgpu_winsys *aws, unsigned size)
769 {
770    unsigned entry_size = get_slab_pot_entry_size(aws, size);
771 
772    if (size <= entry_size * 3 / 4)
773       return entry_size / 4;
774 
775    return entry_size;
776 }
777 
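/* pb_slabs callback: allocate a slab backing buffer for the given heap and
 * entry size and initialize its list of free entries.
 */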
778 struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_size,
779                                      unsigned group_index)
780 {
781    struct amdgpu_winsys *aws = priv;
782    enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
783    enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
784 
785    /* Determine the slab buffer size. */
786    unsigned max_entry_size = 1 << (aws->bo_slabs.min_order + aws->bo_slabs.num_orders - 1);
787 
788    assert(entry_size <= max_entry_size);
789 
790    /* The slab size is twice the size of the largest possible entry. */
791    unsigned slab_size = max_entry_size * 2;
792 
793    if (!util_is_power_of_two_nonzero(entry_size)) {
794       assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
795 
796       /* If the entry size is 3/4 of a power of two, we would waste space and not gain
797        * anything if we allocated only twice the power of two for the backing buffer:
798        *   2 * 3/4 = 1.5 usable with buffer size 2
799        *
800        * Allocating 5 times the entry size leads us to the next power of two and results
801        * in a much better memory utilization:
802        *   5 * 3/4 = 3.75 usable with buffer size 4
803        */
804       if (entry_size * 5 > slab_size)
805          slab_size = util_next_power_of_two(entry_size * 5);
806    }
807 
808    /* The largest slab should have the same size as the PTE fragment
809     * size to get faster address translation.
810     */
811    slab_size = MAX2(slab_size, aws->info.pte_fragment_size);
812 
813    flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING |
814             RADEON_FLAG_NO_SUBALLOC |
815             RADEON_FLAG_WINSYS_SLAB_BACKING;
816 
817    struct amdgpu_bo_real_reusable_slab *slab_bo =
818       (struct amdgpu_bo_real_reusable_slab*)amdgpu_bo_create(aws, slab_size, slab_size,
819                                                              domains, flags);
820    if (!slab_bo)
821       return NULL;
822 
823    /* The slab is not suballocated. */
824    assert(is_real_bo(&slab_bo->b.b.b));
825    assert(slab_bo->b.b.b.type == AMDGPU_BO_REAL_REUSABLE_SLAB);
826 
827    /* We can get a buffer from pb_cache that is slightly larger. */
828    slab_size = slab_bo->b.b.b.base.size;
829 
830    slab_bo->slab.num_entries = slab_size / entry_size;
831    slab_bo->slab.num_free = slab_bo->slab.num_entries;
832    slab_bo->slab.group_index = group_index;
833    slab_bo->slab.entry_size = entry_size;
834    slab_bo->entries = os_malloc_aligned(slab_bo->slab.num_entries * sizeof(*slab_bo->entries),
835                                         CACHE_LINE_SIZE);
836    if (!slab_bo->entries)
837       goto fail;
838 
839    memset(slab_bo->entries, 0, slab_bo->slab.num_entries * sizeof(*slab_bo->entries));
840    list_inithead(&slab_bo->slab.free);
841 
842    for (unsigned i = 0; i < slab_bo->slab.num_entries; ++i) {
843       struct amdgpu_bo_slab_entry *bo = &slab_bo->entries[i];
844 
845       bo->b.base.placement = domains;
846       bo->b.base.alignment_log2 = util_logbase2(get_slab_entry_alignment(aws, entry_size));
847       bo->b.base.size = entry_size;
848       bo->b.type = AMDGPU_BO_SLAB_ENTRY;
849 
850       bo->entry.slab = &slab_bo->slab;
851       list_addtail(&bo->entry.head, &slab_bo->slab.free);
852    }
853 
854    /* Wasted alignment due to slabs with 3/4 allocations being aligned to a power of two. */
855    assert(slab_bo->slab.num_entries * entry_size <= slab_size);
856    if (domains & RADEON_DOMAIN_VRAM)
857       aws->slab_wasted_vram += slab_size - slab_bo->slab.num_entries * entry_size;
858    else
859       aws->slab_wasted_gtt += slab_size - slab_bo->slab.num_entries * entry_size;
860 
861    return &slab_bo->slab;
862 
863 fail:
864    amdgpu_winsys_bo_reference(aws, (struct amdgpu_winsys_bo**)&slab_bo, NULL);
865    return NULL;
866 }
867 
868 void amdgpu_bo_slab_free(struct amdgpu_winsys *aws, struct pb_slab *slab)
869 {
870    struct amdgpu_bo_real_reusable_slab *bo = get_bo_from_slab(slab);
871    unsigned slab_size = bo->b.b.b.base.size;
872 
873    assert(bo->slab.num_entries * bo->slab.entry_size <= slab_size);
874    if (bo->b.b.b.base.placement & RADEON_DOMAIN_VRAM)
875       aws->slab_wasted_vram -= slab_size - bo->slab.num_entries * bo->slab.entry_size;
876    else
877       aws->slab_wasted_gtt -= slab_size - bo->slab.num_entries * bo->slab.entry_size;
878 
879    for (unsigned i = 0; i < bo->slab.num_entries; ++i)
880       amdgpu_bo_remove_fences(&bo->entries[i].b);
881 
882    os_free_aligned(bo->entries);
883    amdgpu_winsys_bo_reference(aws, (struct amdgpu_winsys_bo**)&bo, NULL);
884 }
885 
886 #if DEBUG_SPARSE_COMMITS
887 static void
888 sparse_dump(struct amdgpu_bo_sparse *bo, const char *func)
889 {
890    fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
891                    "Commitments:\n",
892            __func__, bo, bo->b.base.size, bo->num_va_pages, func);
893 
894    struct amdgpu_sparse_backing *span_backing = NULL;
895    uint32_t span_first_backing_page = 0;
896    uint32_t span_first_va_page = 0;
897    uint32_t va_page = 0;
898 
899    for (;;) {
900       struct amdgpu_sparse_backing *backing = 0;
901       uint32_t backing_page = 0;
902 
903       if (va_page < bo->num_va_pages) {
904          backing = bo->commitments[va_page].backing;
905          backing_page = bo->commitments[va_page].page;
906       }
907 
908       if (span_backing &&
909           (backing != span_backing ||
910            backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
911          fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
912                  span_first_va_page, va_page - 1, span_backing,
913                  span_first_backing_page,
914                  span_first_backing_page + (va_page - span_first_va_page) - 1);
915 
916          span_backing = NULL;
917       }
918 
919       if (va_page >= bo->num_va_pages)
920          break;
921 
922       if (backing && !span_backing) {
923          span_backing = backing;
924          span_first_backing_page = backing_page;
925          span_first_va_page = va_page;
926       }
927 
928       va_page++;
929    }
930 
931    fprintf(stderr, "Backing:\n");
932 
933    list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->backing, list) {
934       fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->b.base.size);
935       for (unsigned i = 0; i < backing->num_chunks; ++i)
936          fprintf(stderr, "   %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
937    }
938 }
939 #endif
940 
941 /*
942  * Attempt to allocate the given number of backing pages. Fewer pages may be
943  * allocated (depending on the fragmentation of existing backing buffers),
944  * which will be reflected by a change to *pnum_pages.
945  */
946 static struct amdgpu_sparse_backing *
947 sparse_backing_alloc(struct amdgpu_winsys *aws, struct amdgpu_bo_sparse *bo,
948                      uint32_t *pstart_page, uint32_t *pnum_pages)
949 {
950    struct amdgpu_sparse_backing *best_backing;
951    unsigned best_idx;
952    uint32_t best_num_pages;
953 
954    best_backing = NULL;
955    best_idx = 0;
956    best_num_pages = 0;
957 
958    /* This is a very simple and inefficient best-fit algorithm. */
959    list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->backing, list) {
960       for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
961          uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
962          if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
963             (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
964             best_backing = backing;
965             best_idx = idx;
966             best_num_pages = cur_num_pages;
967          }
968       }
969    }
970 
971    /* Allocate a new backing buffer if necessary. */
972    if (!best_backing) {
973       struct pb_buffer_lean *buf;
974       uint64_t size;
975       uint32_t pages;
976 
977       best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
978       if (!best_backing)
979          return NULL;
980 
981       best_backing->max_chunks = 4;
982       best_backing->chunks = CALLOC(best_backing->max_chunks,
983                                     sizeof(*best_backing->chunks));
984       if (!best_backing->chunks) {
985          FREE(best_backing);
986          return NULL;
987       }
988 
989       assert(bo->num_backing_pages < DIV_ROUND_UP(bo->b.base.size, RADEON_SPARSE_PAGE_SIZE));
990 
991       size = MIN3(bo->b.base.size / 16,
992                   8 * 1024 * 1024,
993                   bo->b.base.size - (uint64_t)bo->num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
994       size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
995 
996       buf = amdgpu_bo_create(aws, size, RADEON_SPARSE_PAGE_SIZE,
997                              bo->b.base.placement,
998                              (bo->b.base.usage & ~RADEON_FLAG_SPARSE &
999                               /* Set the interprocess sharing flag to disable pb_cache because
1000                                * amdgpu_bo_wait doesn't wait for active CS jobs.
1001                                */
1002                               ~RADEON_FLAG_NO_INTERPROCESS_SHARING) | RADEON_FLAG_NO_SUBALLOC);
1003       if (!buf) {
1004          FREE(best_backing->chunks);
1005          FREE(best_backing);
1006          return NULL;
1007       }
1008 
1009       /* We might have gotten a bigger buffer than requested via caching. */
1010       pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
1011 
1012       best_backing->bo = get_real_bo(amdgpu_winsys_bo(buf));
1013       best_backing->num_chunks = 1;
1014       best_backing->chunks[0].begin = 0;
1015       best_backing->chunks[0].end = pages;
1016 
1017       list_add(&best_backing->list, &bo->backing);
1018       bo->num_backing_pages += pages;
1019 
1020       best_idx = 0;
1021       best_num_pages = pages;
1022    }
1023 
1024    *pnum_pages = MIN2(*pnum_pages, best_num_pages);
1025    *pstart_page = best_backing->chunks[best_idx].begin;
1026    best_backing->chunks[best_idx].begin += *pnum_pages;
1027 
1028    if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
1029       memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
1030               sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
1031       best_backing->num_chunks--;
1032    }
1033 
1034    return best_backing;
1035 }
1036 
1037 static void
1038 sparse_free_backing_buffer(struct amdgpu_winsys *aws, struct amdgpu_bo_sparse *bo,
1039                            struct amdgpu_sparse_backing *backing)
1040 {
1041    bo->num_backing_pages -= backing->bo->b.base.size / RADEON_SPARSE_PAGE_SIZE;
1042 
1043    /* Add fences from bo to backing->bo. */
1044    simple_mtx_lock(&aws->bo_fence_lock);
1045    u_foreach_bit(i, bo->b.fences.valid_fence_mask) {
1046       add_seq_no_to_list(aws, &backing->bo->b.fences, i, bo->b.fences.seq_no[i]);
1047    }
1048    simple_mtx_unlock(&aws->bo_fence_lock);
1049 
1050    list_del(&backing->list);
1051    amdgpu_winsys_bo_reference(aws, (struct amdgpu_winsys_bo**)&backing->bo, NULL);
1052    FREE(backing->chunks);
1053    FREE(backing);
1054 }
1055 
1056 /*
1057  * Return a range of pages from the given backing buffer back into the
1058  * free structure.
1059  */
1060 static bool
1061 sparse_backing_free(struct amdgpu_winsys *aws, struct amdgpu_bo_sparse *bo,
1062                     struct amdgpu_sparse_backing *backing,
1063                     uint32_t start_page, uint32_t num_pages)
1064 {
1065    uint32_t end_page = start_page + num_pages;
1066    unsigned low = 0;
1067    unsigned high = backing->num_chunks;
1068 
1069    /* Find the first chunk with begin >= start_page. */
1070    while (low < high) {
1071       unsigned mid = low + (high - low) / 2;
1072 
1073       if (backing->chunks[mid].begin >= start_page)
1074          high = mid;
1075       else
1076          low = mid + 1;
1077    }
1078 
1079    assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
1080    assert(low == 0 || backing->chunks[low - 1].end <= start_page);
1081 
1082    if (low > 0 && backing->chunks[low - 1].end == start_page) {
1083       backing->chunks[low - 1].end = end_page;
1084 
1085       if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
1086          backing->chunks[low - 1].end = backing->chunks[low].end;
1087          memmove(&backing->chunks[low], &backing->chunks[low + 1],
1088                  sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
1089          backing->num_chunks--;
1090       }
1091    } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
1092       backing->chunks[low].begin = start_page;
1093    } else {
1094       if (backing->num_chunks >= backing->max_chunks) {
1095          unsigned new_max_chunks = 2 * backing->max_chunks;
1096          struct amdgpu_sparse_backing_chunk *new_chunks =
1097             REALLOC(backing->chunks,
1098                     sizeof(*backing->chunks) * backing->max_chunks,
1099                     sizeof(*backing->chunks) * new_max_chunks);
1100          if (!new_chunks)
1101             return false;
1102 
1103          backing->max_chunks = new_max_chunks;
1104          backing->chunks = new_chunks;
1105       }
1106 
1107       memmove(&backing->chunks[low + 1], &backing->chunks[low],
1108               sizeof(*backing->chunks) * (backing->num_chunks - low));
1109       backing->chunks[low].begin = start_page;
1110       backing->chunks[low].end = end_page;
1111       backing->num_chunks++;
1112    }
1113 
1114    if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
1115        backing->chunks[0].end == backing->bo->b.base.size / RADEON_SPARSE_PAGE_SIZE)
1116       sparse_free_backing_buffer(aws, bo, backing);
1117 
1118    return true;
1119 }
1120 
1121 static void amdgpu_bo_sparse_destroy(struct radeon_winsys *rws, struct pb_buffer_lean *_buf)
1122 {
1123    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1124    struct amdgpu_bo_sparse *bo = get_sparse_bo(amdgpu_winsys_bo(_buf));
1125    int r;
1126 
1127    r = amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(_buf), 0, true, NULL, 0,
1128                               (uint64_t)bo->num_va_pages * RADEON_SPARSE_PAGE_SIZE,
1129                               amdgpu_va_get_start_addr(bo->va_handle), 0, AMDGPU_VA_OP_CLEAR);
1130    if (r) {
1131       fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
1132    }
1133 
1134    while (!list_is_empty(&bo->backing)) {
1135       sparse_free_backing_buffer(aws, bo,
1136                                  container_of(bo->backing.next,
1137                                               struct amdgpu_sparse_backing, list));
1138    }
1139 
1140    ac_drm_va_range_free(bo->va_handle);
1141    FREE(bo->commitments);
1142    simple_mtx_destroy(&bo->commit_lock);
1143    FREE(bo);
1144 }
1145 
1146 static struct pb_buffer_lean *
1147 amdgpu_bo_sparse_create(struct amdgpu_winsys *aws, uint64_t size,
1148                         enum radeon_bo_domain domain,
1149                         enum radeon_bo_flag flags)
1150 {
1151    struct amdgpu_bo_sparse *bo;
1152    uint64_t map_size;
1153    uint64_t va_gap_size;
1154    int r;
1155 
1156    /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
1157     * that exceed this limit. This is not really a restriction: we don't have
1158     * that much virtual address space anyway.
1159     */
1160    if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
1161       return NULL;
1162 
1163    bo = CALLOC_STRUCT(amdgpu_bo_sparse);
1164    if (!bo)
1165       return NULL;
1166 
1167    simple_mtx_init(&bo->commit_lock, mtx_plain);
1168    pipe_reference_init(&bo->b.base.reference, 1);
1169    bo->b.base.placement = domain;
1170    bo->b.base.alignment_log2 = util_logbase2(RADEON_SPARSE_PAGE_SIZE);
1171    bo->b.base.usage = flags;
1172    bo->b.base.size = size;
1173    bo->b.unique_id =  __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
1174    bo->b.type = AMDGPU_BO_SPARSE;
1175 
1176    bo->num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1177    bo->commitments = CALLOC(bo->num_va_pages, sizeof(*bo->commitments));
1178    if (!bo->commitments)
1179       goto error_alloc_commitments;
1180 
1181    list_inithead(&bo->backing);
1182 
1183    /* For simplicity, we always map a multiple of the page size. */
1184    map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
1185    va_gap_size = aws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
1186 
1187    uint64_t gpu_address;
1188    r = ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
1189                                           map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
1190                                           0, &gpu_address, &bo->va_handle, AMDGPU_VA_RANGE_HIGH);
1191    if (r)
1192       goto error_va_alloc;
1193 
1194    r = amdgpu_bo_va_op_common(aws, NULL, 0, false, &bo->vm_timeline_point, 0, map_size,
1195                               gpu_address, AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
1196    if (r)
1197       goto error_va_map;
1198 
1199    return &bo->b.base;
1200 
1201 error_va_map:
1202    ac_drm_va_range_free(bo->va_handle);
1203 error_va_alloc:
1204    FREE(bo->commitments);
1205 error_alloc_commitments:
1206    simple_mtx_destroy(&bo->commit_lock);
1207    FREE(bo);
1208    return NULL;
1209 }
1210 
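/* Commit or uncommit physical backing memory for the given byte range of a
 * sparse buffer by (re)mapping the corresponding virtual address pages.
 */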
1211 static bool
1212 amdgpu_bo_sparse_commit(struct radeon_winsys *rws, struct pb_buffer_lean *buf,
1213                         uint64_t offset, uint64_t size, bool commit)
1214 {
1215    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1216    struct amdgpu_bo_sparse *bo = get_sparse_bo(amdgpu_winsys_bo(buf));
1217    struct amdgpu_sparse_commitment *comm;
1218    uint32_t va_page, end_va_page;
1219    bool ok = true;
1220    int r;
1221 
1222    assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
1223    assert(offset <= bo->b.base.size);
1224    assert(size <= bo->b.base.size - offset);
1225    assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->b.base.size);
1226 
1227    comm = bo->commitments;
1228    va_page = offset / RADEON_SPARSE_PAGE_SIZE;
1229    end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1230 
1231    simple_mtx_lock(&bo->commit_lock);
1232 
1233 #if DEBUG_SPARSE_COMMITS
1234    sparse_dump(bo, __func__);
1235 #endif
1236 
1237    if (commit) {
1238       while (va_page < end_va_page) {
1239          uint32_t span_va_page;
1240 
1241          /* Skip pages that are already committed. */
1242          if (comm[va_page].backing) {
1243             va_page++;
1244             continue;
1245          }
1246 
1247          /* Determine length of uncommitted span. */
1248          span_va_page = va_page;
1249          while (va_page < end_va_page && !comm[va_page].backing)
1250             va_page++;
1251 
1252          /* Fill the uncommitted span with chunks of backing memory. */
1253          while (span_va_page < va_page) {
1254             struct amdgpu_sparse_backing *backing;
1255             uint32_t backing_start, backing_size;
1256 
1257             backing_size = va_page - span_va_page;
1258             backing = sparse_backing_alloc(aws, bo, &backing_start, &backing_size);
1259             if (!backing) {
1260                ok = false;
1261                goto out;
1262             }
1263 
1264             r = amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(buf), backing->bo->kms_handle,
1265                                        true, &bo->vm_timeline_point,
1266                                        (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
1267                                        (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
1268                                        amdgpu_va_get_start_addr(bo->va_handle) +
1269                                        (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
1270                                        AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
1271                                           AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_REPLACE);
1272             if (r) {
1273                ok = sparse_backing_free(aws, bo, backing, backing_start, backing_size);
1274                assert(ok && "sufficient memory should already be allocated");
1275 
1276                ok = false;
1277                goto out;
1278             }
1279 
1280             while (backing_size) {
1281                comm[span_va_page].backing = backing;
1282                comm[span_va_page].page = backing_start;
1283                span_va_page++;
1284                backing_start++;
1285                backing_size--;
1286             }
1287          }
1288       }
1289    } else {
1290       r = amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(buf), 0, true, &bo->vm_timeline_point,
1291                                  0, (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
1292                                  amdgpu_va_get_start_addr(bo->va_handle) +
1293                                     (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
1294                                  AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
1295       if (r) {
1296          ok = false;
1297          goto out;
1298       }
1299 
1300       while (va_page < end_va_page) {
1301          struct amdgpu_sparse_backing *backing;
1302          uint32_t backing_start;
1303          uint32_t span_pages;
1304 
1305          /* Skip pages that are already uncommitted. */
1306          if (!comm[va_page].backing) {
1307             va_page++;
1308             continue;
1309          }
1310 
1311          /* Group contiguous spans of pages. */
1312          backing = comm[va_page].backing;
1313          backing_start = comm[va_page].page;
1314          comm[va_page].backing = NULL;
1315 
1316          span_pages = 1;
1317          va_page++;
1318 
1319          while (va_page < end_va_page &&
1320                 comm[va_page].backing == backing &&
1321                 comm[va_page].page == backing_start + span_pages) {
1322             comm[va_page].backing = NULL;
1323             va_page++;
1324             span_pages++;
1325          }
1326 
1327          if (!sparse_backing_free(aws, bo, backing, backing_start, span_pages)) {
1328             /* Couldn't allocate tracking data structures, so we have to leak */
1329             fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
1330             ok = false;
1331          }
1332       }
1333    }
1334 out:
1335 
1336    simple_mtx_unlock(&bo->commit_lock);
1337 
1338    return ok;
1339 }
1340 
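/* Return the number of bytes to skip from range_offset until the first
 * committed page, and shrink *range_size to the length of the committed span
 * that follows (0 if nothing in the range is committed).
 */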
1341 static unsigned
1342 amdgpu_bo_find_next_committed_memory(struct pb_buffer_lean *buf,
1343                                      uint64_t range_offset, unsigned *range_size)
1344 {
1345    struct amdgpu_bo_sparse *bo = get_sparse_bo(amdgpu_winsys_bo(buf));
1346    struct amdgpu_sparse_commitment *comm;
1347    uint32_t va_page, end_va_page;
1348    uint32_t span_va_page, start_va_page;
1349    unsigned uncommitted_range_prev, uncommitted_range_next;
1350 
1351    if (*range_size == 0)
1352       return 0;
1353 
1354    assert(*range_size + range_offset <= bo->b.base.size);
1355 
1356    uncommitted_range_prev = uncommitted_range_next = 0;
1357    comm = bo->commitments;
1358    start_va_page = va_page = range_offset / RADEON_SPARSE_PAGE_SIZE;
1359    end_va_page = (*range_size + range_offset) / RADEON_SPARSE_PAGE_SIZE;
1360 
1361    simple_mtx_lock(&bo->commit_lock);
1362    /* Look up the first committed page with backing physical storage */
1363    while (va_page < end_va_page && !comm[va_page].backing)
1364       va_page++;
1365 
1366    /* First committed page lookup failed; return early. */
1367    if (va_page == end_va_page && !comm[va_page].backing) {
1368       uncommitted_range_prev = *range_size;
1369       *range_size = 0;
1370       simple_mtx_unlock(&bo->commit_lock);
1371       return uncommitted_range_prev;
1372    }
1373 
1374    /* Look up the first uncommitted page (one without backing physical storage). */
1375    span_va_page = va_page;
1376    while (va_page < end_va_page && comm[va_page].backing)
1377       va_page++;
1378    simple_mtx_unlock(&bo->commit_lock);
1379 
1380    /* Calculate the byte count to skip before the committed range. */
1381    if (span_va_page != start_va_page)
1382       uncommitted_range_prev = span_va_page * RADEON_SPARSE_PAGE_SIZE - range_offset;
1383 
1384    /* Calculate the byte count to skip after the committed range. */
1385    if (va_page != end_va_page || !comm[va_page].backing) {
1386       uncommitted_range_next = *range_size + range_offset - va_page * RADEON_SPARSE_PAGE_SIZE;
1387    }
1388 
1389    /* Calculate the size of the first committed part. */
1390    *range_size = *range_size - uncommitted_range_next - uncommitted_range_prev;
1391    return *range_size ? uncommitted_range_prev : uncommitted_range_prev + uncommitted_range_next;
1392 }
1393 
1394 static void amdgpu_buffer_get_metadata(struct radeon_winsys *rws,
1395                                        struct pb_buffer_lean *_buf,
1396                                        struct radeon_bo_metadata *md,
1397                                        struct radeon_surf *surf)
1398 {
1399    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1400    struct amdgpu_bo_real *bo = get_real_bo(amdgpu_winsys_bo(_buf));
1401    struct amdgpu_bo_info info = {0};
1402    uint32_t md_version, md_flags;
1403    enum amd_gfx_level gfx_level = aws->info.gfx_level;
1404    int r;
1405 
1406    r = ac_drm_bo_query_info(aws->dev, bo->kms_handle, &info);
1407    if (r)
1408       return;
1409 
1410    md->size_metadata = info.metadata.size_metadata;
1411    memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
1412 
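   /* Metadata dword 0 packs the UMD metadata version in the low 16 bits and
    * flags in the high 16 bits.
    */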
1413    md_version = md->metadata[0] & 0xffff;
1414    if (md_version >= 3 && md->size_metadata > 4) {
1415       md_flags = md->metadata[0] >> 16;
1416       if (md_flags & (1u << AC_SURF_METADATA_FLAG_FAMILY_OVERRIDEN_BIT)) {
1417          /* The overridden gfx_level is always the last dword. */
1418          gfx_level = md->metadata[md->size_metadata / 4 - 1];
1419 
1420          /* Fall back to the default value if the value we got is incorrect. */
1421          if (gfx_level < GFX6 || gfx_level >= NUM_GFX_VERSIONS)
1422             gfx_level = aws->info.gfx_level;
1423       }
1424    }
1425 
1426    ac_surface_apply_bo_metadata(gfx_level, surf, info.metadata.tiling_info,
1427                                 &md->mode);
1428 }
1429 
1430 static void amdgpu_buffer_set_metadata(struct radeon_winsys *rws,
1431                                        struct pb_buffer_lean *_buf,
1432                                        struct radeon_bo_metadata *md,
1433                                        struct radeon_surf *surf)
1434 {
1435    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1436    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1437    struct amdgpu_bo_real *real = is_real_bo(bo) ? get_real_bo(bo) : get_slab_entry_real_bo(bo);
1438    struct amdgpu_bo_metadata metadata = {0};
1439 
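   /* Pack the surface's tiling parameters into the kernel metadata blob so
    * that importers of this BO can reconstruct the layout.
    */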
1440    ac_surface_compute_bo_metadata(&aws->info, surf, &metadata.tiling_info);
1441 
1442    metadata.size_metadata = md->size_metadata;
1443    memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
1444 
1445    ac_drm_bo_set_metadata(aws->dev, real->kms_handle, &metadata);
1446 }
1447 
1448 struct pb_buffer_lean *
1449 amdgpu_bo_create(struct amdgpu_winsys *aws,
1450                  uint64_t size,
1451                  unsigned alignment,
1452                  enum radeon_bo_domain domain,
1453                  enum radeon_bo_flag flags)
1454 {
1455    struct amdgpu_winsys_bo *bo;
1456 
1457    radeon_canonicalize_bo_flags(&domain, &flags);
1458 
1459    /* Handle sparse buffers first. */
1460    if (flags & RADEON_FLAG_SPARSE) {
1461       assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
1462 
1463       return amdgpu_bo_sparse_create(aws, size, domain, flags);
1464    }
1465 
1466    unsigned max_slab_entry_size = 1 << (aws->bo_slabs.min_order + aws->bo_slabs.num_orders - 1);
1467    int heap = radeon_get_heap_index(domain, flags);
1468 
1469    /* Sub-allocate small buffers from slabs. */
1470    if (heap >= 0 && size <= max_slab_entry_size) {
1471       struct pb_slab_entry *entry;
1472       unsigned alloc_size = size;
1473 
1474       /* Always use slabs for sizes less than 4 KB because the kernel aligns
1475        * everything to 4 KB.
1476        */
1477       if (size < alignment && alignment <= 4 * 1024)
1478          alloc_size = alignment;
1479 
1480       if (alignment > get_slab_entry_alignment(aws, alloc_size)) {
1481          /* 3/4-of-power-of-two slab allocations can return too small an alignment.
1482           * Try again with a power-of-two allocation size.
1483           */
1484          unsigned pot_size = get_slab_pot_entry_size(aws, alloc_size);
1485 
1486          if (alignment <= pot_size) {
1487             /* This size works but wastes some memory to fulfil the alignment. */
1488             alloc_size = pot_size;
1489          } else {
1490             goto no_slab; /* can't fulfil alignment requirements */
1491          }
1492       }
1493 
1494       entry = pb_slab_alloc(&aws->bo_slabs, alloc_size, heap);
1495       if (!entry) {
1496          /* Clean up buffer managers and try again. */
1497          amdgpu_clean_up_buffer_managers(aws);
1498 
1499          entry = pb_slab_alloc(&aws->bo_slabs, alloc_size, heap);
1500       }
1501       if (!entry)
1502          return NULL;
1503 
1504       struct amdgpu_bo_slab_entry *slab_bo = container_of(entry, struct amdgpu_bo_slab_entry, entry);
1505       pipe_reference_init(&slab_bo->b.base.reference, 1);
1506       slab_bo->b.base.size = size;
1507       slab_bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
1508       assert(alignment <= 1 << slab_bo->b.base.alignment_log2);
1509 
1510       if (domain & RADEON_DOMAIN_VRAM)
1511          aws->slab_wasted_vram += get_slab_wasted_size(aws, slab_bo);
1512       else
1513          aws->slab_wasted_gtt += get_slab_wasted_size(aws, slab_bo);
1514 
1515       return &slab_bo->b.base;
1516    }
1517 no_slab:
1518 
1519    /* Align size to page size. This is the minimum alignment for normal
1520     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
1521     * like constant/uniform buffers, can benefit from better and more reuse.
1522     */
1523    if (domain & RADEON_DOMAIN_VRAM_GTT) {
1524       size = align64(size, aws->info.gart_page_size);
1525       alignment = align(alignment, aws->info.gart_page_size);
1526    }
1527 
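   /* Only buffers that can't be shared across processes and don't need to be
    * discardable or cleared can use the reusable cache; doorbell BOs never do.
    */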
1528    bool use_reusable_pool = !(domain & RADEON_DOMAIN_DOORBELL) &&
1529       (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING) &&
1530       !(flags & (RADEON_FLAG_DISCARDABLE | RADEON_FLAG_CLEAR_VRAM));
1531 
1532    if (use_reusable_pool) {
1533        /* RADEON_FLAG_NO_SUBALLOC is irrelevant for the cache. */
1534        heap = radeon_get_heap_index(domain, flags & ~RADEON_FLAG_NO_SUBALLOC);
1535        assert(heap >= 0 && heap < RADEON_NUM_HEAPS);
1536 
1537        /* Get a buffer from the cache. */
1538        bo = (struct amdgpu_winsys_bo*)
1539             pb_cache_reclaim_buffer(&aws->bo_cache, size, alignment, 0, heap);
1540        if (bo) {
1541           /* If the buffer is amdgpu_bo_real_reusable, but we need amdgpu_bo_real_reusable_slab,
1542            * keep the allocation but make the structure bigger.
1543            */
1544           if (flags & RADEON_FLAG_WINSYS_SLAB_BACKING && bo->type == AMDGPU_BO_REAL_REUSABLE) {
1545              const unsigned orig_size = sizeof(struct amdgpu_bo_real_reusable);
1546              const unsigned new_size = sizeof(struct amdgpu_bo_real_reusable_slab);
1547              struct amdgpu_winsys_bo *new_bo =
1548                 (struct amdgpu_winsys_bo*)REALLOC(bo, orig_size, new_size);
1549 
1550              if (!new_bo) {
1551                 amdgpu_winsys_bo_reference(aws, &bo, NULL);
1552                 return NULL;
1553              }
1554 
1555              memset((uint8_t*)new_bo + orig_size, 0, new_size - orig_size);
1556              bo = new_bo;
1557              bo->type = AMDGPU_BO_REAL_REUSABLE_SLAB;
1558           }
1559           return &bo->base;
1560        }
1561    }
1562 
1563    /* Create a new one. */
1564    bo = amdgpu_create_bo(aws, size, alignment, domain, flags, heap);
1565    if (!bo) {
1566       /* Clean up buffer managers and try again. */
1567       amdgpu_clean_up_buffer_managers(aws);
1568 
1569       bo = amdgpu_create_bo(aws, size, alignment, domain, flags, heap);
1570       if (!bo)
1571          return NULL;
1572    }
1573 
1574    return &bo->base;
1575 }
1576 
1577 static struct pb_buffer_lean *
1578 amdgpu_buffer_create(struct radeon_winsys *rws,
1579                      uint64_t size,
1580                      unsigned alignment,
1581                      enum radeon_bo_domain domain,
1582                      enum radeon_bo_flag flags)
1583 {
1584    struct pb_buffer_lean *res = amdgpu_bo_create(amdgpu_winsys(rws), size, alignment,
1585                                                   domain, flags);
1586    return res;
1587 }
1588 
1589 static struct pb_buffer_lean *amdgpu_bo_from_handle(struct radeon_winsys *rws,
1590                                                struct winsys_handle *whandle,
1591                                                unsigned vm_alignment,
1592                                                bool is_prime_linear_buffer)
1593 {
1594    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1595    struct amdgpu_bo_real *bo = NULL;
1596    enum amdgpu_bo_handle_type type;
1597    struct ac_drm_bo_import_result result = {0};
1598    uint64_t va;
1599    amdgpu_va_handle va_handle = NULL;
1600    struct amdgpu_bo_info info = {0};
1601    enum radeon_bo_domain initial = 0;
1602    enum radeon_bo_flag flags = 0;
1603    int r;
1604 
1605    switch (whandle->type) {
1606    case WINSYS_HANDLE_TYPE_SHARED:
1607       assert(!aws->info.is_virtio); /* Legacy path, not handled by virtio. */
1608       type = amdgpu_bo_handle_type_gem_flink_name;
1609       break;
1610    case WINSYS_HANDLE_TYPE_FD:
1611       type = amdgpu_bo_handle_type_dma_buf_fd;
1612       break;
1613    default:
1614       return NULL;
1615    }
1616 
1617    r = ac_drm_bo_import(aws->dev, type, whandle->handle, &result);
1618    if (r)
1619       return NULL;
1620 
1621    simple_mtx_lock(&aws->bo_export_table_lock);
1622    bo = util_hash_table_get(aws->bo_export_table, result.bo.abo);
1623 
1624    /* If the amdgpu_winsys_bo instance already exists, bump the reference
1625     * counter and return it.
1626     */
1627    if (bo) {
1628       p_atomic_inc(&bo->b.base.reference.count);
1629       simple_mtx_unlock(&aws->bo_export_table_lock);
1630 
1631       /* Release the buffer handle, because we don't need it anymore.
1632        * This function is returning an existing buffer, which has its own
1633        * handle.
1634        */
1635       ac_drm_bo_free(aws->dev, result.bo);
1636       return &bo->b.base;
1637    }
1638 
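   /* Export a KMS (GEM) handle for the imported BO; it's needed for the
    * info query and the VA map below.
    */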
1639    uint32_t kms_handle;
1640    ac_drm_bo_export(aws->dev, result.bo, amdgpu_bo_handle_type_kms, &kms_handle);
1641 
1642    /* Get initial domains. */
1643    r = ac_drm_bo_query_info(aws->dev, kms_handle, &info);
1644    if (r)
1645       goto error;
1646 
1647    r = ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
1648                                           result.alloc_size,
1649                                           amdgpu_get_optimal_alignment(aws, result.alloc_size,
1650                                                                        vm_alignment),
1651                                           0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
1652    if (r)
1653       goto error;
1654 
1655    bo = CALLOC_STRUCT(amdgpu_bo_real);
1656    if (!bo)
1657       goto error;
1658 
1659    r = amdgpu_bo_va_op_common(aws, NULL, kms_handle, false, &bo->vm_timeline_point, 0,
1660                               result.alloc_size, va, AMDGPU_VM_PAGE_READABLE |
1661                                  AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE |
1662                                  (is_prime_linear_buffer ? AMDGPU_VM_MTYPE_UC : 0),
1663                               AMDGPU_VA_OP_MAP);
1664    if (r)
1665       goto error;
1666 
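   /* Translate the kernel's allocation info into winsys domains and flags. */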
1667    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
1668       initial |= RADEON_DOMAIN_VRAM;
1669    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
1670       initial |= RADEON_DOMAIN_GTT;
1671    if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)
1672       flags |= RADEON_FLAG_NO_CPU_ACCESS;
1673    if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
1674       flags |= RADEON_FLAG_GTT_WC;
1675    if (info.alloc_flags & AMDGPU_GEM_CREATE_ENCRYPTED) {
1676       /* Imports are always possible even if the importer isn't using TMZ.
1677        * For instance libweston needs to import the buffer to be able to determine
1678        * if it can be used for scanout.
1679        */
1680       flags |= RADEON_FLAG_ENCRYPTED;
1681       *((bool*)&rws->uses_secure_bos) = true;
1682    }
1683    if (info.alloc_flags & AMDGPU_GEM_CREATE_GFX12_DCC)
1684       flags |= RADEON_FLAG_GFX12_ALLOW_DCC;
1685 
1686    /* Initialize the structure. */
1687    pipe_reference_init(&bo->b.base.reference, 1);
1688    bo->b.base.placement = initial;
1689    bo->b.base.alignment_log2 = util_logbase2(info.phys_alignment ?
1690                                              info.phys_alignment : aws->info.gart_page_size);
1691    bo->b.base.usage = flags;
1692    bo->b.base.size = result.alloc_size;
1693    bo->b.type = AMDGPU_BO_REAL;
1694    bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
1695    simple_mtx_init(&bo->map_lock, mtx_plain);
1696    bo->bo = result.bo;
1697    bo->va_handle = va_handle;
1698    bo->kms_handle = kms_handle;
1699    bo->is_shared = true;
1700 
1701    if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
1702       aws->allocated_vram += align64(bo->b.base.size, aws->info.gart_page_size);
1703    else if (bo->b.base.placement & RADEON_DOMAIN_GTT)
1704       aws->allocated_gtt += align64(bo->b.base.size, aws->info.gart_page_size);
1705 
1706    amdgpu_add_buffer_to_global_list(aws, bo);
1707 
1708    _mesa_hash_table_insert(aws->bo_export_table, bo->bo.abo, bo);
1709    simple_mtx_unlock(&aws->bo_export_table_lock);
1710 
1711    return &bo->b.base;
1712 
1713 error:
1714    simple_mtx_unlock(&aws->bo_export_table_lock);
1715    if (bo)
1716       FREE(bo);
1717    if (va_handle)
1718       ac_drm_va_range_free(va_handle);
1719    ac_drm_bo_free(aws->dev, result.bo);
1720    return NULL;
1721 }
1722 
1723 static bool amdgpu_bo_get_handle(struct radeon_winsys *rws,
1724                                  struct pb_buffer_lean *buffer,
1725                                  struct winsys_handle *whandle)
1726 {
1727    struct amdgpu_screen_winsys *sws = amdgpu_screen_winsys(rws);
1728    struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1729    enum amdgpu_bo_handle_type type;
1730    struct hash_entry *entry;
1731    int r;
1732 
1733    /* Don't allow exports of slab entries and sparse buffers. */
1734    if (!is_real_bo(amdgpu_winsys_bo(buffer)))
1735       return false;
1736 
1737    struct amdgpu_bo_real *bo = get_real_bo(amdgpu_winsys_bo(buffer));
1738 
1739    /* This removes the REUSABLE enum if it's set. */
1740    bo->b.type = AMDGPU_BO_REAL;
1741 
1742    switch (whandle->type) {
1743    case WINSYS_HANDLE_TYPE_SHARED:
1744       /* This is a legacy code-path, not supported by virtio. */
1745       assert(!aws->info.is_virtio);
1746       type = amdgpu_bo_handle_type_gem_flink_name;
1747       break;
1748    case WINSYS_HANDLE_TYPE_KMS:
1749       if (sws->fd == aws->fd) {
1750          /* For virtio we can't return kms_handle, because it's not a GEM handle,
1751           * but a resource ID. Instead, repurpose the deprecated type
1752           * amdgpu_bo_handle_type_kms_noimport to request a GEM handle.
1753           */
1754          if (aws->info.is_virtio)
1755             ac_drm_bo_export(aws->dev, bo->bo,
1756                              amdgpu_bo_handle_type_kms_noimport,
1757                              &whandle->handle);
1758          else
1759             whandle->handle = bo->kms_handle;
1760 
1761          if (bo->is_shared)
1762             return true;
1763 
1764          goto hash_table_set;
1765       }
1766 
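      /* The screen uses a different DRM fd than the winsys: export a dma-buf
       * and re-import it on the screen's fd to get a KMS handle that is valid
       * there, caching the result in kms_handles.
       */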
1767       simple_mtx_lock(&aws->sws_list_lock);
1768       entry = _mesa_hash_table_search(sws->kms_handles, bo);
1769       simple_mtx_unlock(&aws->sws_list_lock);
1770       if (entry) {
1771          whandle->handle = (uintptr_t)entry->data;
1772          return true;
1773       }
1774       FALLTHROUGH;
1775    case WINSYS_HANDLE_TYPE_FD:
1776       type = amdgpu_bo_handle_type_dma_buf_fd;
1777       break;
1778    default:
1779       return false;
1780    }
1781 
1782    r = ac_drm_bo_export(aws->dev, bo->bo, type, &whandle->handle);
1783    if (r)
1784       return false;
1785 
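   /* Label the exported dma-buf with the process name and PID so it can be
    * identified in the kernel's dma-buf debug output.
    */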
1786 #if defined(DMA_BUF_SET_NAME_B)
1787    if (whandle->type == WINSYS_HANDLE_TYPE_FD &&
1788        !bo->is_shared) {
1789       char dmabufname[32];
1790       snprintf(dmabufname, 32, "%d-%s", getpid(), util_get_process_name());
1791       r = ioctl(whandle->handle, DMA_BUF_SET_NAME_B, (uint64_t)(uintptr_t)dmabufname);
1792    }
1793 #endif
1794 
1795    if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
1796       int dma_fd = whandle->handle;
1797 
1798       r = drmPrimeFDToHandle(sws->fd, dma_fd, &whandle->handle);
1799       close(dma_fd);
1800 
1801       if (r)
1802          return false;
1803 
1804       simple_mtx_lock(&aws->sws_list_lock);
1805       _mesa_hash_table_insert_pre_hashed(sws->kms_handles,
1806                                          bo->kms_handle, bo,
1807                                          (void*)(uintptr_t)whandle->handle);
1808       simple_mtx_unlock(&aws->sws_list_lock);
1809    }
1810 
1811  hash_table_set:
1812    simple_mtx_lock(&aws->bo_export_table_lock);
1813    _mesa_hash_table_insert(aws->bo_export_table, bo->bo.abo, bo);
1814    simple_mtx_unlock(&aws->bo_export_table_lock);
1815 
1816    bo->is_shared = true;
1817    return true;
1818 }
1819 
1820 static struct pb_buffer_lean *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
1821 					    void *pointer, uint64_t size,
1822 					    enum radeon_bo_flag flags)
1823 {
1824     struct amdgpu_winsys *aws = amdgpu_winsys(rws);
1825     ac_drm_bo buf_handle;
1826     struct amdgpu_bo_real *bo;
1827     uint64_t va;
1828     amdgpu_va_handle va_handle;
1829     /* Avoid failure when the size is not page aligned */
1830     uint64_t aligned_size = align64(size, aws->info.gart_page_size);
1831 
1832     bo = CALLOC_STRUCT(amdgpu_bo_real);
1833     if (!bo)
1834         return NULL;
1835 
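    /* Wrap the user allocation in a kernel userptr BO. */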
1836     if (ac_drm_create_bo_from_user_mem(aws->dev, pointer,
1837                                                     aligned_size, &buf_handle))
1838         goto error;
1839 
1840     if (ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
1841                                            aligned_size,
1842                                            amdgpu_get_optimal_alignment(aws, aligned_size,
1843                                                                         aws->info.gart_page_size),
1844                                            0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH))
1845         goto error_va_alloc;
1846 
1847     uint32_t kms_handle;
1848     ac_drm_bo_export(aws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
1849 
1850     if (amdgpu_bo_va_op_common(aws, NULL, kms_handle, false, &bo->vm_timeline_point, 0,
1851                                aligned_size, va, AMDGPU_VM_PAGE_READABLE |
1852                                   AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
1853                                AMDGPU_VA_OP_MAP))
1854        goto error_va_map;
1855 
1856     /* Initialize it. */
1857     bo->is_user_ptr = true;
1858     pipe_reference_init(&bo->b.base.reference, 1);
1859     bo->b.base.placement = RADEON_DOMAIN_GTT;
1860     bo->b.base.alignment_log2 = 0;
1861     bo->b.base.size = size;
1862     bo->b.type = AMDGPU_BO_REAL;
1863     bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
1864     simple_mtx_init(&bo->map_lock, mtx_plain);
1865     bo->bo = buf_handle;
1866     bo->cpu_ptr = pointer;
1867     bo->va_handle = va_handle;
1868     bo->kms_handle = kms_handle;
1869 
1870     aws->allocated_gtt += aligned_size;
1871 
1872     amdgpu_add_buffer_to_global_list(aws, bo);
1873 
1874     return (struct pb_buffer_lean*)bo;
1875 
1876 error_va_map:
1877     ac_drm_va_range_free(va_handle);
1878 
1879 error_va_alloc:
1880     ac_drm_bo_free(aws->dev, buf_handle);
1881 
1882 error:
1883     FREE(bo);
1884     return NULL;
1885 }
1886 
1887 static bool amdgpu_bo_is_user_ptr(struct pb_buffer_lean *buf)
1888 {
1889    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1890 
1891    return is_real_bo(bo) ? get_real_bo(bo)->is_user_ptr : false;
1892 }
1893 
1894 static bool amdgpu_bo_is_suballocated(struct pb_buffer_lean *buf)
1895 {
1896    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1897 
1898    return bo->type == AMDGPU_BO_SLAB_ENTRY;
1899 }
1900 
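/* Return the GPU virtual address of a buffer. Slab entries add their offset
 * within the parent (real) BO's VA range.
 */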
1901 uint64_t amdgpu_bo_get_va(struct pb_buffer_lean *buf)
1902 {
1903    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1904 
1905    if (bo->type == AMDGPU_BO_SLAB_ENTRY) {
1906       struct amdgpu_bo_real_reusable_slab *slab_bo =
1907          (struct amdgpu_bo_real_reusable_slab *)get_slab_entry_real_bo(bo);
1908 
1909       return amdgpu_va_get_start_addr(slab_bo->b.b.va_handle) + get_slab_entry_offset(bo);
1910    } else if (bo->type == AMDGPU_BO_SPARSE) {
1911       return amdgpu_va_get_start_addr(get_sparse_bo(bo)->va_handle);
1912    } else {
1913       return amdgpu_va_get_start_addr(get_real_bo(bo)->va_handle);
1914    }
1915 }
1916 
1917 static void amdgpu_buffer_destroy(struct radeon_winsys *rws, struct pb_buffer_lean *buf)
1918 {
1919    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1920 
1921    if (bo->type == AMDGPU_BO_SLAB_ENTRY)
1922       amdgpu_bo_slab_destroy(rws, buf);
1923    else if (bo->type == AMDGPU_BO_SPARSE)
1924       amdgpu_bo_sparse_destroy(rws, buf);
1925    else
1926       amdgpu_bo_destroy_or_cache(rws, buf);
1927 }
1928 
1929 void amdgpu_bo_init_functions(struct amdgpu_screen_winsys *sws)
1930 {
1931    sws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
1932    sws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
1933    sws->base.buffer_map = amdgpu_bo_map;
1934    sws->base.buffer_unmap = amdgpu_bo_unmap;
1935    sws->base.buffer_wait = amdgpu_bo_wait;
1936    sws->base.buffer_create = amdgpu_buffer_create;
1937    sws->base.buffer_destroy = amdgpu_buffer_destroy;
1938    sws->base.buffer_from_handle = amdgpu_bo_from_handle;
1939    sws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
1940    sws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
1941    sws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
1942    sws->base.buffer_get_handle = amdgpu_bo_get_handle;
1943    sws->base.buffer_commit = amdgpu_bo_sparse_commit;
1944    sws->base.buffer_find_next_committed_memory = amdgpu_bo_find_next_committed_memory;
1945    sws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
1946    sws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
1947    sws->base.buffer_get_flags = amdgpu_bo_get_flags;
1948 }
1949