1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <limits.h>
27 #include <assert.h>
28 #include <sys/mman.h>
29 
30 #include "anv_private.h"
31 
32 #include "common/intel_aux_map.h"
33 #include "util/anon_file.h"
34 #include "util/futex.h"
35 
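/* The VG_NOACCESS_READ/WRITE helpers below temporarily mark the target as
 * accessible for Valgrind, perform a single read or write, and then restore
 * the NOACCESS marking, so the allocator can touch memory it has otherwise
 * hidden from its clients.
 */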
36 #ifdef HAVE_VALGRIND
37 #define VG_NOACCESS_READ(__ptr) ({                       \
38    VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \
39    __typeof(*(__ptr)) __val = *(__ptr);                  \
40    VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\
41    __val;                                                \
42 })
43 #define VG_NOACCESS_WRITE(__ptr, __val) ({                  \
44    VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr)));  \
45    *(__ptr) = (__val);                                      \
46    VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));   \
47 })
48 #else
49 #define VG_NOACCESS_READ(__ptr) (*(__ptr))
50 #define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val))
51 #endif
52 
53 #ifndef MAP_POPULATE
54 #define MAP_POPULATE 0
55 #endif
56 
57 /* Design goals:
58  *
59  *  - Lock free (except when resizing underlying bos)
60  *
61  *  - Constant time allocation with typically only one atomic
62  *
63  *  - Multiple allocation sizes without fragmentation
64  *
65  *  - Can grow while keeping addresses and offsets of contents stable
66  *
67  *  - All allocations within one bo so we can point one of the
68  *    STATE_BASE_ADDRESS pointers at it.
69  *
70  * The overall design is a two-level allocator: top level is a fixed size, big
71  * block (8k) allocator, which operates out of a bo.  Allocation is done by
72  * either pulling a block from the free list or growing the used range of the
73  * bo.  Growing the range may run out of space in the bo which we then need to
74  * grow.  Growing the bo is tricky in a multi-threaded, lockless environment:
75  * we need to keep all pointers and contents in the old map valid.  GEM bos in
76  * general can't grow, but we use a trick: we create a memfd and use ftruncate
77  * to grow it as necessary.  We mmap the new size and then create a gem bo for
78  * it using the new gem userptr ioctl.  Without heavy-handed locking around
79  * our allocation fast-path, there isn't really a way to munmap the old mmap,
80  * so we just keep it around until garbage collection time.  While the block
81  * allocator is lockless for normal operations, we block other threads trying
82  * to allocate while we're growing the map.  It shouldn't happen often, and
83  * growing is fast anyway.
84  *
85  * At the next level we can use various sub-allocators.  The state pool is a
86  * pool of smaller, fixed size objects, which operates much like the block
87  * pool.  It uses a free list for freeing objects, but when it runs out of
88  * space it just allocates a new block from the block pool.  This allocator is
89  * intended for longer lived state objects such as SURFACE_STATE and most
90  * other persistent state objects in the API.  We may need to track more info
91  * with these objects and a pointer back to the CPU object (e.g. VkImage).  In
92  * those cases we just allocate a slightly bigger object and put the extra
93  * state after the GPU state object.
94  *
95  * The state stream allocator works similarly to how the i965 DRI driver streams
96  * all its state.  Even with Vulkan, we need to emit transient state (whether
97  * surface state base or dynamic state base), and for that we can just get a
98  * block and fill it up.  These cases are local to a command buffer and the
99  * sub-allocator need not be thread safe.  The streaming allocator gets a new
100  * block when it runs out of space and chains them together so they can be
101  * easily freed.
102  */
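/* Illustrative usage sketch (assumes a state pool such as
 * device->dynamic_state_pool has already been set up; this is caller-side
 * code, not part of the allocator itself):
 *
 *    struct anv_state state =
 *       anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
 *    memcpy(state.map, data, size);   // CPU-visible mapping
 *    // state.offset is relative to the pool's base address and is what
 *    // gets programmed into GPU commands.
 *    anv_state_pool_free(&device->dynamic_state_pool, state);
 */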
103 
104 /* Allocations are always at least 64 byte aligned, so UINT32_MAX is an
105  * invalid value.  We use it to indicate the free list is empty. */
106 #define EMPTY UINT32_MAX
107 
108 /* On FreeBSD PAGE_SIZE is already defined in
109  * /usr/include/machine/param.h that is indirectly
110  * included here.
111  */
112 #ifndef PAGE_SIZE
113 #define PAGE_SIZE 4096
114 #endif
115 
116 struct anv_state_table_cleanup {
117    void *map;
118    size_t size;
119 };
120 
121 #define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0})
122 #define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry))
123 
124 static VkResult
125 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size);
126 
127 VkResult
128 anv_state_table_init(struct anv_state_table *table,
129                     struct anv_device *device,
130                     uint32_t initial_entries)
131 {
132    VkResult result;
133 
134    table->device = device;
135 
136    /* Just make it 2GB up-front.  The Linux kernel won't actually back it
137     * with pages until we either map and fault on one of them or we use
138     * userptr and send a chunk of it off to the GPU.
139     */
140    table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table");
141    if (table->fd == -1)
142       return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
143 
144    if (!u_vector_init(&table->cleanups, 8,
145                       sizeof(struct anv_state_table_cleanup))) {
146       result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
147       goto fail_fd;
148    }
149 
150    table->state.next = 0;
151    table->state.end = 0;
152    table->size = 0;
153 
154    uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE;
155    result = anv_state_table_expand_range(table, initial_size);
156    if (result != VK_SUCCESS)
157       goto fail_cleanups;
158 
159    return VK_SUCCESS;
160 
161  fail_cleanups:
162    u_vector_finish(&table->cleanups);
163  fail_fd:
164    close(table->fd);
165 
166    return result;
167 }
168 
169 static VkResult
170 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
171 {
172    void *map;
173    struct anv_state_table_cleanup *cleanup;
174 
175    /* Assert that we only ever grow the pool */
176    assert(size >= table->state.end);
177 
178    /* Make sure that we don't go outside the bounds of the memfd */
179    if (size > BLOCK_POOL_MEMFD_SIZE)
180       return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
181 
182    cleanup = u_vector_add(&table->cleanups);
183    if (!cleanup)
184       return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
185 
186    *cleanup = ANV_STATE_TABLE_CLEANUP_INIT;
187 
188    /* Just leak the old map until we destroy the pool.  We can't munmap it
189     * without races or imposing locking on the block allocate fast path. On
190     * the whole the leaked maps add up to less than the size of the
191     * current map.  MAP_POPULATE seems like the right thing to do, but we
192     * should try to get some numbers.
193     */
194    map = mmap(NULL, size, PROT_READ | PROT_WRITE,
195               MAP_SHARED | MAP_POPULATE, table->fd, 0);
196    if (map == MAP_FAILED) {
197       return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY,
198                        "mmap failed: %m");
199    }
200 
201    cleanup->map = map;
202    cleanup->size = size;
203 
204    table->map = map;
205    table->size = size;
206 
207    return VK_SUCCESS;
208 }
209 
210 static VkResult
211 anv_state_table_grow(struct anv_state_table *table)
212 {
213    VkResult result = VK_SUCCESS;
214 
215    uint32_t used = align(table->state.next * ANV_STATE_ENTRY_SIZE,
216                          PAGE_SIZE);
217    uint32_t old_size = table->size;
218 
219    /* The state table is always initialized to a nonzero size and this function
220     * is always called after initialization.
221     */
222    assert(old_size > 0);
223 
224    uint32_t required = MAX2(used, old_size);
225    if (used * 2 <= required) {
226       /* If we're in this case then this isn't the first allocation and we
227        * already have enough space on both sides to hold double what we
228        * have allocated.  There's nothing for us to do.
229        */
230       goto done;
231    }
232 
233    uint32_t size = old_size * 2;
234    while (size < required)
235       size *= 2;
236 
237    assert(size > table->size);
238 
239    result = anv_state_table_expand_range(table, size);
240 
241  done:
242    return result;
243 }
244 
245 void
246 anv_state_table_finish(struct anv_state_table *table)
247 {
248    struct anv_state_table_cleanup *cleanup;
249 
250    u_vector_foreach(cleanup, &table->cleanups) {
251       if (cleanup->map)
252          munmap(cleanup->map, cleanup->size);
253    }
254 
255    u_vector_finish(&table->cleanups);
256 
257    close(table->fd);
258 }
259 
260 VkResult
261 anv_state_table_add(struct anv_state_table *table, uint32_t *idx,
262                     uint32_t count)
263 {
264    struct anv_block_state state, old, new;
265    VkResult result;
266 
267    assert(idx);
268 
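   /* The atomic bump below has three possible outcomes: the reserved range
    * fits within the current table (fast path), we are the first thread to
    * run past the end and therefore grow the table, or we ran past the end
    * while another thread is growing it and we futex_wait until the new end
    * is published.
    */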
269    while(1) {
270       state.u64 = __sync_fetch_and_add(&table->state.u64, count);
271       if (state.next + count <= state.end) {
272          assert(table->map);
273          struct anv_free_entry *entry = &table->map[state.next];
274          for (int i = 0; i < count; i++) {
275             entry[i].state.idx = state.next + i;
276          }
277          *idx = state.next;
278          return VK_SUCCESS;
279       } else if (state.next <= state.end) {
280          /* We allocated the first block outside the pool so we have to grow
281           * the pool.  pool_state->next acts as a mutex: threads who try to
282           * allocate now will get block indexes above the current limit and
283           * hit futex_wait below.
284           */
285          new.next = state.next + count;
286          do {
287             result = anv_state_table_grow(table);
288             if (result != VK_SUCCESS)
289                return result;
290             new.end = table->size / ANV_STATE_ENTRY_SIZE;
291          } while (new.end < new.next);
292 
293          old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64);
294          if (old.next != state.next)
295             futex_wake(&table->state.end, INT_MAX);
296       } else {
297          futex_wait(&table->state.end, state.end, NULL);
298          continue;
299       }
300    }
301 }
302 
303 void
304 anv_free_list_push(union anv_free_list *list,
305                    struct anv_state_table *table,
306                    uint32_t first, uint32_t count)
307 {
308    union anv_free_list current, old, new;
309    uint32_t last = first;
310 
311    for (uint32_t i = 1; i < count; i++, last++)
312       table->map[last].next = last + 1;
313 
314    old.u64 = list->u64;
315    do {
316       current = old;
317       table->map[last].next = current.offset;
318       new.offset = first;
319       new.count = current.count + 1;
320       old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
321    } while (old.u64 != current.u64);
322 }
323 
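/* Both anv_free_list_push() and anv_free_list_pop() compare-and-swap the
 * whole 64-bit {offset, count} pair at once.  Bumping the count on every
 * update makes a stale head observed by a racing thread fail the CAS
 * instead of silently corrupting the list (the classic ABA problem).
 */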
324 struct anv_state *
325 anv_free_list_pop(union anv_free_list *list,
326                   struct anv_state_table *table)
327 {
328    union anv_free_list current, new, old;
329 
330    current.u64 = list->u64;
331    while (current.offset != EMPTY) {
332       __sync_synchronize();
333       new.offset = table->map[current.offset].next;
334       new.count = current.count + 1;
335       old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
336       if (old.u64 == current.u64) {
337          struct anv_free_entry *entry = &table->map[current.offset];
338          return &entry->state;
339       }
340       current = old;
341    }
342 
343    return NULL;
344 }
345 
346 static VkResult
347 anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size);
348 
349 VkResult
350 anv_block_pool_init(struct anv_block_pool *pool,
351                     struct anv_device *device,
352                     const char *name,
353                     uint64_t start_address,
354                     uint32_t initial_size,
355                     uint32_t max_size)
356 {
357    VkResult result;
358 
359    /* Make sure VMA addresses are aligned for the block pool */
360    assert(anv_is_aligned(start_address, device->info->mem_alignment));
361    assert(anv_is_aligned(initial_size, device->info->mem_alignment));
362    assert(max_size > 0);
363    assert(max_size > initial_size);
364 
365    pool->name = name;
366    pool->device = device;
367    pool->nbos = 0;
368    pool->size = 0;
369    pool->start_address = intel_canonical_address(start_address);
370    pool->max_size = max_size;
371 
372    pool->bo = NULL;
373 
374    pool->state.next = 0;
375    pool->state.end = 0;
376 
377    pool->bo_alloc_flags =
378       ANV_BO_ALLOC_FIXED_ADDRESS |
379       ANV_BO_ALLOC_MAPPED |
380       ANV_BO_ALLOC_HOST_CACHED_COHERENT |
381       ANV_BO_ALLOC_CAPTURE |
382       ANV_BO_ALLOC_INTERNAL;
383 
384    result = anv_block_pool_expand_range(pool, initial_size);
385    if (result != VK_SUCCESS)
386       return result;
387 
388    /* Make the entire pool available in the front of the pool.  If back
389     * allocation needs to use this space, the "ends" will be re-arranged.
390     */
391    pool->state.end = pool->size;
392 
393    return VK_SUCCESS;
394 }
395 
396 void
397 anv_block_pool_finish(struct anv_block_pool *pool)
398 {
399    anv_block_pool_foreach_bo(bo, pool) {
400       assert(bo->refcount == 1);
401       anv_device_release_bo(pool->device, bo);
402    }
403 }
404 
405 static VkResult
406 anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size)
407 {
408    /* Assert that we only ever grow the pool */
409    assert(size >= pool->state.end);
410 
411    /* For state pool BOs we have to be a bit careful about where we place them
412     * in the GTT.  There are two documented workarounds for state base address
413     * placement: Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset
414     * which state that those two base addresses do not support 48-bit
415     * addresses and need to be placed in the bottom 32-bit range.
416     * Unfortunately, this is not quite accurate.
417     *
418     * The real problem is that we always set the size of our state pools in
419     * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most
420     * likely significantly smaller.  We do this because we do not know at the
421     * time we emit STATE_BASE_ADDRESS whether or not we will need to expand
422     * the pool during command buffer building so we don't actually have a
423     * valid final size.  If the address + size, as seen by STATE_BASE_ADDRESS,
424     * overflows 48 bits, the GPU appears to treat all accesses to the buffer
425     * as being out of bounds and returns zero.  For dynamic state, this
426     * usually just leads to rendering corruptions, but shaders that are all
427     * zero hang the GPU immediately.
428     *
429     * The easiest solution is to do exactly what the bogus workarounds say to
430     * do: restrict these buffers to 32-bit addresses.  We could also pin the
431     * BO to some particular location of our choosing, but that's significantly
432     * more work than just not setting a flag.  So, we explicitly DO NOT set
433     * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the
434     * hard work for us.  When using softpin, we're in control and the fixed
435     * addresses we choose are fine for base addresses.
436     */
437 
438    uint32_t new_bo_size = size - pool->size;
439    struct anv_bo *new_bo = NULL;
440    VkResult result = anv_device_alloc_bo(pool->device,
441                                          pool->name,
442                                          new_bo_size,
443                                          pool->bo_alloc_flags,
444                                          intel_48b_address(pool->start_address + pool->size),
445                                          &new_bo);
446    if (result != VK_SUCCESS)
447       return result;
448 
449    pool->bos[pool->nbos++] = new_bo;
450 
451    /* This pointer will always point to the first BO in the list */
452    pool->bo = pool->bos[0];
453 
454    assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS);
455    pool->size = size;
456 
457    return VK_SUCCESS;
458 }
459 
460 /** Returns current memory map of the block pool.
461  *
462  * The returned pointer points to the map for the memory at the specified
463  * offset. The offset parameter is relative to the "center" of the block pool
464  * rather than the start of the block pool BO map.
465  */
466 void*
467 anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
468 {
469    struct anv_bo *bo = NULL;
470    int32_t bo_offset = 0;
471    anv_block_pool_foreach_bo(iter_bo, pool) {
472       if (offset < bo_offset + iter_bo->size) {
473          bo = iter_bo;
474          break;
475       }
476       bo_offset += iter_bo->size;
477    }
478    assert(bo != NULL);
479    assert(offset >= bo_offset);
480    assert((offset - bo_offset) + size <= bo->size);
481 
482    return bo->map + (offset - bo_offset);
483 }
484 
485 /** Grows and re-centers the block pool.
486  *
487  * We grow the block pool in one or both directions in such a way that the
488  * following conditions are met:
489  *
490  *  1) The size of the entire pool is always a power of two.
491  *
492  *  2) The pool only grows on both ends.  Neither end can get
493  *     shortened.
494  *
495  *  3) At the end of the allocation, we have about twice as much space
496  *     allocated for each end as we have used.  This way the pool doesn't
497  *     grow too far in one direction or the other.
498  *
499  *  4) We have enough space allocated for at least one more block in
500  *     whichever side `state` points to.
501  *
502  *  5) The center of the pool is always aligned to both the block_size of
503  *     the pool and a 4K CPU page.
504  */
505 static uint32_t
506 anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
507                     uint32_t contiguous_size)
508 {
509    VkResult result = VK_SUCCESS;
510 
511    pthread_mutex_lock(&pool->device->mutex);
512 
513    assert(state == &pool->state);
514 
515    /* Gather a little usage information on the pool.  Since we may have
516     * threads waiting in queue to get some storage while we resize, it's
517     * actually possible that total_used will be larger than old_size.  In
518     * particular, block_pool_alloc() increments state->next prior to
519     * calling block_pool_grow, so this ensures that we get enough space for
520     * whichever side tries to grow the pool.
521     *
522     * We align to a page size because it makes it easier to do our
523     * calculations later in such a way that we stay page-aligned.
524     */
525    uint32_t total_used = align(pool->state.next, PAGE_SIZE);
526 
527    uint32_t old_size = pool->size;
528 
529    /* The block pool is always initialized to a nonzero size and this function
530     * is always called after initialization.
531     */
532    assert(old_size > 0);
533 
534    /* total_used may be smaller than the actual requirement because
535     * it is based on the next pointer, which is updated prior to calling
536     * this function.
537     */
538    uint32_t required = MAX2(total_used, old_size);
539 
540    /* With softpin, the pool is made up of a bunch of buffers with separate
541     * maps.  Make sure we have enough contiguous space that we can get a
542     * properly contiguous map for the next chunk.
543     */
544    required = MAX2(required, old_size + contiguous_size);
545 
546    if (required > pool->max_size) {
547       result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
548    } else if (total_used * 2 > required) {
549       uint32_t size = old_size * 2;
550       while (size < required)
551          size *= 2;
552 
553       size = MIN2(size, pool->max_size);
554       assert(size > pool->size);
555 
556       result = anv_block_pool_expand_range(pool, size);
557    }
558 
559    pthread_mutex_unlock(&pool->device->mutex);
560 
561    if (result != VK_SUCCESS)
562       return 0;
563 
564    /* Return the appropriate new size.  This function never actually
565     * updates state->next.  Instead, we let the caller do that because it
566     * needs to do so in order to maintain its concurrency model.
567     */
568    return pool->size;
569 }
570 
571 static VkResult
572 anv_block_pool_alloc_new(struct anv_block_pool *pool,
573                          struct anv_block_state *pool_state,
574                          uint32_t block_size,
575                          int64_t *offset,
576                          uint32_t *padding)
577 {
578    struct anv_block_state state, old, new;
579 
580    /* Most allocations won't generate any padding */
581    if (padding)
582       *padding = 0;
583 
584    while (1) {
585       state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
586       if (state.next + block_size > pool->max_size) {
587          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
588       } else if (state.next + block_size <= state.end) {
589          *offset =  state.next;
590          return VK_SUCCESS;
591       } else if (state.next <= state.end) {
592          if (state.next < state.end) {
593             /* We need to grow the block pool, but still have some leftover
594              * space that can't be used by that particular allocation. So we
595              * add that as a "padding", and return it.
596              */
597             uint32_t leftover = state.end - state.next;
598 
599             /* If there is some leftover space in the pool, the caller must
600              * deal with it.
601              */
602             assert(leftover == 0 || padding);
603             if (padding)
604                *padding = leftover;
605             state.next += leftover;
606          }
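         /* Concrete example of the padding path: if only 100 bytes remain
          * between state.next and state.end while block_size is 8192, those
          * 100 bytes are reported through *padding, state.next is advanced
          * past them, and the caller is expected to hand them back to a free
          * list (see anv_state_pool_alloc_no_vg()).
          */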
607 
608          /* We allocated the first block outside the pool so we have to grow
609           * the pool.  pool_state->next acts as a mutex: threads who try to
610           * allocate now will get block indexes above the current limit and
611           * hit futex_wait below.
612           */
613          new.next = state.next + block_size;
614          do {
615             new.end = anv_block_pool_grow(pool, pool_state, block_size);
616             if (pool->size > 0 && new.end == 0) {
617                futex_wake(&pool_state->end, INT_MAX);
618                return VK_ERROR_OUT_OF_DEVICE_MEMORY;
619             }
620          } while (new.end < new.next);
621 
622          old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
623          if (old.next != state.next)
624             futex_wake(&pool_state->end, INT_MAX);
625          *offset = state.next;
626          return VK_SUCCESS;
627       } else {
628          futex_wait(&pool_state->end, state.end, NULL);
629          continue;
630       }
631    }
632 }
633 
634 VkResult
635 anv_block_pool_alloc(struct anv_block_pool *pool,
636                      uint32_t block_size,
637                      int64_t *offset, uint32_t *padding)
638 {
639    return anv_block_pool_alloc_new(pool, &pool->state, block_size, offset, padding);
640 }
641 
642 VkResult
643 anv_state_pool_init(struct anv_state_pool *pool,
644                     struct anv_device *device,
645                     const struct anv_state_pool_params *params)
646 {
647    uint32_t initial_size = MAX2(params->block_size * 16,
648                                 device->info->mem_alignment);
649 
650    VkResult result = anv_block_pool_init(&pool->block_pool, device,
651                                          params->name,
652                                          params->base_address + params->start_offset,
653                                          initial_size,
654                                          params->max_size);
655    if (result != VK_SUCCESS)
656       return result;
657 
658    pool->start_offset = params->start_offset;
659 
660    result = anv_state_table_init(&pool->table, device, 64);
661    if (result != VK_SUCCESS) {
662       anv_block_pool_finish(&pool->block_pool);
663       return result;
664    }
665 
666    assert(util_is_power_of_two_or_zero(params->block_size));
667    pool->block_size = params->block_size;
668    for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
669       pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
670       pool->buckets[i].block.next = 0;
671       pool->buckets[i].block.end = 0;
672    }
673    VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
674 
675    return VK_SUCCESS;
676 }
677 
678 void
679 anv_state_pool_finish(struct anv_state_pool *pool)
680 {
681    VG(VALGRIND_DESTROY_MEMPOOL(pool));
682    anv_state_table_finish(&pool->table);
683    anv_block_pool_finish(&pool->block_pool);
684 }
685 
686 static VkResult
687 anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
688                                     struct anv_block_pool *block_pool,
689                                     uint32_t state_size,
690                                     uint32_t block_size,
691                                     int64_t *offset,
692                                     uint32_t *padding)
693 {
694    struct anv_block_state block, old, new;
695 
696    /* We don't always use anv_block_pool_alloc(), which would set *padding to
697     * zero for us. So if we have a pointer to padding, we must zero it out
698     * ourselves here, to make sure we always return some sensible value.
699     */
700    if (padding)
701       *padding = 0;
702 
703    /* If our state is large, we don't need any sub-allocation from a block.
704     * Instead, we just grab whole (potentially large) blocks.
705     */
706    if (state_size >= block_size)
707       return anv_block_pool_alloc(block_pool, state_size, offset, padding);
708 
709  restart:
710    block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);
711 
712    if (block.next < block.end) {
713       *offset = block.next;
714       return VK_SUCCESS;
715    } else if (block.next == block.end) {
716       VkResult result = anv_block_pool_alloc(block_pool, block_size,
717                                              offset, padding);
718       if (result != VK_SUCCESS)
719          return result;
720       new.next = *offset + state_size;
721       new.end = *offset + block_size;
722       old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
723       if (old.next != block.next)
724          futex_wake(&pool->block.end, INT_MAX);
725       return result;
726    } else {
727       futex_wait(&pool->block.end, block.end, NULL);
728       goto restart;
729    }
730 }
731 
732 static uint32_t
733 anv_state_pool_get_bucket(uint32_t size)
734 {
735    unsigned size_log2 = util_logbase2_ceil(size);
736    assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
737    if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
738       size_log2 = ANV_MIN_STATE_SIZE_LOG2;
739    return size_log2 - ANV_MIN_STATE_SIZE_LOG2;
740 }
741 
742 static uint32_t
743 anv_state_pool_get_bucket_size(uint32_t bucket)
744 {
745    uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2;
746    return 1 << size_log2;
747 }
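/* Worked example, assuming ANV_MIN_STATE_SIZE_LOG2 is 6 (64 bytes): a
 * 100-byte request has util_logbase2_ceil(100) == 7, so it maps to bucket
 * 7 - 6 == 1, and anv_state_pool_get_bucket_size(1) == 128.  Requests of
 * 64 bytes or less all share bucket 0.
 */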
748 
749 /** Helper to push a chunk into the state table.
750  *
751  * It creates 'count' entries in the state table and updates their sizes,
752  * offsets and maps, also pushing them as "free" states.
753  */
754 static void
755 anv_state_pool_return_blocks(struct anv_state_pool *pool,
756                              uint32_t chunk_offset, uint32_t count,
757                              uint32_t block_size)
758 {
759    /* Disallow returning 0 chunks */
760    assert(count != 0);
761 
762    /* Make sure we always return chunks aligned to the block_size */
763    assert(chunk_offset % block_size == 0);
764 
765    uint32_t st_idx;
766    UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count);
767    assert(result == VK_SUCCESS);
768    for (int i = 0; i < count; i++) {
769       /* update states that were added back to the state table */
770       struct anv_state *state_i = anv_state_table_get(&pool->table,
771                                                       st_idx + i);
772       state_i->alloc_size = block_size;
773       state_i->offset = pool->start_offset + chunk_offset + block_size * i;
774       state_i->map = anv_block_pool_map(&pool->block_pool,
775                                         state_i->offset,
776                                         state_i->alloc_size);
777    }
778 
779    uint32_t block_bucket = anv_state_pool_get_bucket(block_size);
780    anv_free_list_push(&pool->buckets[block_bucket].free_list,
781                       &pool->table, st_idx, count);
782 }
783 
784 /** Returns a chunk of memory back to the state pool.
785  *
786  * Do a two-level split. If chunk_size is bigger than divisor
787  * (pool->block_size), we return as many divisor sized blocks as we can, from
788  * the end of the chunk.
789  *
790  * The remainder is then split into smaller blocks (starting at small_size if
791  * it is non-zero), with larger blocks always being taken from the end of the
792  * chunk.
793  */
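/* Worked example: with block_size == 8192, returning a 20480-byte chunk
 * first gives back two 8192-byte blocks taken from the end of the chunk,
 * leaving 4096 bytes.  With small_size == 0 that remainder goes back as a
 * single 4096-byte block; with small_size == 64 it would instead be carved
 * into 64-byte blocks.
 */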
794 static void
795 anv_state_pool_return_chunk(struct anv_state_pool *pool,
796                             uint32_t chunk_offset, uint32_t chunk_size,
797                             uint32_t small_size)
798 {
799    uint32_t divisor = pool->block_size;
800    uint32_t nblocks = chunk_size / divisor;
801    uint32_t rest = chunk_size - nblocks * divisor;
802 
803    if (nblocks > 0) {
804       /* First return divisor aligned and sized chunks. We start returning
805        * larger blocks from the end of the chunk, since they should already be
806        * aligned to divisor. Also anv_state_pool_return_blocks() only accepts
807        * aligned chunks.
808        */
809       uint32_t offset = chunk_offset + rest;
810       anv_state_pool_return_blocks(pool, offset, nblocks, divisor);
811    }
812 
813    chunk_size = rest;
814    divisor /= 2;
815 
816    if (small_size > 0 && small_size < divisor)
817       divisor = small_size;
818 
819    uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2;
820 
821    /* Just as before, return larger divisor aligned blocks from the end of the
822     * chunk first.
823     */
824    while (chunk_size > 0 && divisor >= min_size) {
825       nblocks = chunk_size / divisor;
826       rest = chunk_size - nblocks * divisor;
827       if (nblocks > 0) {
828          anv_state_pool_return_blocks(pool, chunk_offset + rest,
829                                       nblocks, divisor);
830          chunk_size = rest;
831       }
832       divisor /= 2;
833    }
834 }
835 
836 static struct anv_state
837 anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
838                            uint32_t size, uint32_t align)
839 {
840    uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align));
841 
842    struct anv_state *state;
843    uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
844    int64_t offset;
845 
846    /* Try free list first. */
847    state = anv_free_list_pop(&pool->buckets[bucket].free_list,
848                              &pool->table);
849    if (state) {
850       assert(state->offset >= pool->start_offset);
851       goto done;
852    }
853 
854    /* Try to grab a chunk from some larger bucket and split it up */
855    for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) {
856       state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table);
857       if (state) {
858          unsigned chunk_size = anv_state_pool_get_bucket_size(b);
859          int32_t chunk_offset = state->offset;
860 
861          /* First let's update the state we got to its new size.  Offset and map
862           * remain the same.
863           */
864          state->alloc_size = alloc_size;
865 
866          /* Now return the unused part of the chunk back to the pool as free
867           * blocks
868           *
869           * There are a couple of options as to what we do with it:
870           *
871           *    1) We could fully split the chunk into state.alloc_size sized
872           *       pieces.  However, this would mean that allocating a 16B
873           *       state could potentially split a 2MB chunk into 128K smaller
874           *       chunks.  This would lead to unnecessary fragmentation.
875           *
876           *    2) The classic "buddy allocator" method would have us split the
877           *       chunk in half and return one half.  Then we would split the
878           *       remaining half in half and return one half, and repeat as
879           *       needed until we get down to the size we want.  However, if
880           *       you are allocating a bunch of the same size state (which is
881           *       the common case), this means that every other allocation has
882           *       to go up a level and every fourth goes up two levels, etc.
883           *       This is not nearly as efficient as it could be if we did a
884           *       little more work up-front.
885           *
886           *    3) Split the difference between (1) and (2) by doing a
887           *       two-level split.  If it's bigger than some fixed block_size,
888           *       we split it into block_size sized chunks and return all but
889           *       one of them.  Then we split what remains into
890           *       state.alloc_size sized chunks and return them.
891           *
892           * We choose something close to option (3), which is implemented with
893           * anv_state_pool_return_chunk(). That is done by returning the
894           * remainder of the chunk, with alloc_size as a hint of the size that
895           * we want the smaller chunk split into.
896           */
897          anv_state_pool_return_chunk(pool, chunk_offset + alloc_size,
898                                      chunk_size - alloc_size, alloc_size);
899          goto done;
900       }
901    }
902 
903    uint32_t padding;
904    VkResult result =
905       anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
906                                           &pool->block_pool,
907                                           alloc_size,
908                                           pool->block_size,
909                                           &offset,
910                                           &padding);
911    if (result != VK_SUCCESS)
912       return ANV_STATE_NULL;
913 
914    /* Every time we allocate a new state, add it to the state table */
915    uint32_t idx = 0;
916    result = anv_state_table_add(&pool->table, &idx, 1);
917    assert(result == VK_SUCCESS);
918 
919    state = anv_state_table_get(&pool->table, idx);
920    state->offset = pool->start_offset + offset;
921    state->alloc_size = alloc_size;
922    state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);
923 
924    if (padding > 0) {
925       uint32_t return_offset = offset - padding;
926       anv_state_pool_return_chunk(pool, return_offset, padding, 0);
927    }
928 
929 done:
930    return *state;
931 }
932 
933 struct anv_state
934 anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
935 {
936    if (size == 0)
937       return ANV_STATE_NULL;
938 
939    struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align);
940    VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
941    return state;
942 }
943 
944 static void
945 anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
946 {
947    assert(util_is_power_of_two_or_zero(state.alloc_size));
948    unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);
949 
950    assert(state.offset >= pool->start_offset);
951 
952    anv_free_list_push(&pool->buckets[bucket].free_list,
953                       &pool->table, state.idx, 1);
954 }
955 
956 void
957 anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
958 {
959    if (state.alloc_size == 0)
960       return;
961 
962    VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
963    anv_state_pool_free_no_vg(pool, state);
964 }
965 
966 struct anv_state_stream_block {
967    struct anv_state block;
968 
969    /* The next block */
970    struct anv_state_stream_block *next;
971 
972 #ifdef HAVE_VALGRIND
973    /* A pointer to the first user-allocated thing in this block.  This is
974     * what valgrind sees as the start of the block.
975     */
976    void *_vg_ptr;
977 #endif
978 };
979 
980 /* The state stream allocator is a one-shot, single threaded allocator for
981  * variable sized blocks.  We use it for allocating dynamic state.
982  */
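/* Illustrative usage sketch (the pool and block size below are just
 * examples; a command buffer typically owns streams like this):
 *
 *    struct anv_state_stream stream;
 *    anv_state_stream_init(&stream, &device->dynamic_state_pool, 16384);
 *    struct anv_state s = anv_state_stream_alloc(&stream, 256, 64);
 *    // fill s.map; s.offset is the GPU-visible offset
 *    anv_state_stream_finish(&stream);   // releases every block at once
 */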
983 void
984 anv_state_stream_init(struct anv_state_stream *stream,
985                       struct anv_state_pool *state_pool,
986                       uint32_t block_size)
987 {
988    stream->state_pool = state_pool;
989    stream->block_size = block_size;
990 
991    stream->block = ANV_STATE_NULL;
992 
993    /* Ensure that next + whatever > block_size.  This way the first call to
994     * state_stream_alloc fetches a new block.
995     */
996    stream->next = block_size;
997 
998    stream->total_size = 0;
999    util_dynarray_init(&stream->all_blocks, NULL);
1000 
1001    VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
1002 }
1003 
1004 void
1005 anv_state_stream_finish(struct anv_state_stream *stream)
1006 {
1007    util_dynarray_foreach(&stream->all_blocks, struct anv_state, block) {
1008       VG(VALGRIND_MEMPOOL_FREE(stream, block->map));
1009       VG(VALGRIND_MAKE_MEM_NOACCESS(block->map, block->alloc_size));
1010       anv_state_pool_free_no_vg(stream->state_pool, *block);
1011    }
1012    util_dynarray_fini(&stream->all_blocks);
1013 
1014    VG(VALGRIND_DESTROY_MEMPOOL(stream));
1015 }
1016 
1017 struct anv_state
1018 anv_state_stream_alloc(struct anv_state_stream *stream,
1019                        uint32_t size, uint32_t alignment)
1020 {
1021    if (size == 0)
1022       return ANV_STATE_NULL;
1023 
1024    assert(alignment <= PAGE_SIZE);
1025 
1026    uint32_t offset = align(stream->next, alignment);
1027    if (offset + size > stream->block.alloc_size) {
1028       uint32_t block_size = stream->block_size;
1029       if (block_size < size)
1030          block_size = util_next_power_of_two(size);
1031 
1032       stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
1033                                                  block_size, PAGE_SIZE);
1034       util_dynarray_append(&stream->all_blocks,
1035                            struct anv_state, stream->block);
1036       VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size));
1037 
1038       /* Reset back to the start */
1039       stream->next = offset = 0;
1040       assert(offset + size <= stream->block.alloc_size);
1041       stream->total_size += block_size;
1042    }
1043    const bool new_block = stream->next == 0;
1044 
1045    struct anv_state state = stream->block;
1046    state.offset += offset;
1047    state.alloc_size = size;
1048    state.map += offset;
1049 
1050    stream->next = offset + size;
1051 
1052    if (new_block) {
1053       assert(state.map == stream->block.map);
1054       VG(VALGRIND_MEMPOOL_ALLOC(stream, state.map, size));
1055    } else {
1056       /* This only updates the mempool.  The newly allocated chunk is still
1057        * marked as NOACCESS. */
1058       VG(VALGRIND_MEMPOOL_CHANGE(stream, stream->block.map, stream->block.map,
1059                                  stream->next));
1060       /* Mark the newly allocated chunk as undefined */
1061       VG(VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size));
1062    }
1063 
1064    return state;
1065 }
1066 
1067 void
1068 anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool,
1069                              struct anv_state_pool *parent,
1070                              uint32_t count, uint32_t size, uint32_t alignment)
1071 {
1072    pool->pool = parent;
1073    pool->reserved_blocks = ANV_FREE_LIST_EMPTY;
1074    pool->count = count;
1075 
1076    for (unsigned i = 0; i < count; i++) {
1077       struct anv_state state = anv_state_pool_alloc(pool->pool, size, alignment);
1078       anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1079    }
1080 }
1081 
1082 void
1083 anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool)
1084 {
1085    struct anv_state *state;
1086 
1087    while ((state = anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table))) {
1088       anv_state_pool_free(pool->pool, *state);
1089       pool->count--;
1090    }
1091    assert(pool->count == 0);
1092 }
1093 
1094 struct anv_state
1095 anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool)
1096 {
1097    return *anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table);
1098 }
1099 
1100 void
1101 anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
1102                              struct anv_state state)
1103 {
1104    anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1105 }
1106 
1107 void
1108 anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
1109                  const char *name, enum anv_bo_alloc_flags alloc_flags)
1110 {
1111    pool->name = name;
1112    pool->device = device;
1113    pool->bo_alloc_flags = alloc_flags;
1114 
1115    for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1116       util_sparse_array_free_list_init(&pool->free_list[i],
1117                                        &device->bo_cache.bo_map, 0,
1118                                        offsetof(struct anv_bo, free_index));
1119    }
1120 
1121    VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
1122 }
1123 
1124 void
1125 anv_bo_pool_finish(struct anv_bo_pool *pool)
1126 {
1127    for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1128       while (1) {
1129          struct anv_bo *bo =
1130             util_sparse_array_free_list_pop_elem(&pool->free_list[i]);
1131          if (bo == NULL)
1132             break;
1133 
1134          /* anv_device_release_bo is going to "free" it */
1135          VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1));
1136          anv_device_release_bo(pool->device, bo);
1137       }
1138    }
1139 
1140    VG(VALGRIND_DESTROY_MEMPOOL(pool));
1141 }
1142 
1143 VkResult
1144 anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
1145                   struct anv_bo **bo_out)
1146 {
1147    const unsigned size_log2 = size < 4096 ? 12 : util_logbase2_ceil(size);
1148    const unsigned pow2_size = 1 << size_log2;
1149    const unsigned bucket = size_log2 - 12;
1150    assert(bucket < ARRAY_SIZE(pool->free_list));
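   /* Example of the bucketing above: a 5000-byte request rounds up to
    * pow2_size == 8192 (size_log2 == 13) and lands in bucket 1, while any
    * request of 4096 bytes or less shares bucket 0.
    */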
1151 
1152    struct anv_bo *bo =
1153       util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]);
1154    if (bo != NULL) {
1155       VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1156       *bo_out = bo;
1157       return VK_SUCCESS;
1158    }
1159 
1160    VkResult result = anv_device_alloc_bo(pool->device,
1161                                          pool->name,
1162                                          pow2_size,
1163                                          pool->bo_alloc_flags,
1164                                          0 /* explicit_address */,
1165                                          &bo);
1166    if (result != VK_SUCCESS)
1167       return result;
1168 
1169    /* We want it to look like it came from this pool */
1170    VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
1171    VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1172 
1173    *bo_out = bo;
1174 
1175    return VK_SUCCESS;
1176 }
1177 
1178 void
1179 anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo)
1180 {
1181    VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));
1182 
1183    assert(util_is_power_of_two_or_zero(bo->size));
1184    const unsigned size_log2 = util_logbase2_ceil(bo->size);
1185    const unsigned bucket = size_log2 - 12;
1186    assert(bucket < ARRAY_SIZE(pool->free_list));
1187 
1188    assert(util_sparse_array_get(&pool->device->bo_cache.bo_map,
1189                                 bo->gem_handle) == bo);
1190    util_sparse_array_free_list_push(&pool->free_list[bucket],
1191                                     &bo->gem_handle, 1);
1192 }
1193 
1194 // Scratch pool
1195 
1196 void
1197 anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool)
1198 {
1199    memset(pool, 0, sizeof(*pool));
1200 }
1201 
1202 void
1203 anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool)
1204 {
1205    for (unsigned s = 0; s < ARRAY_SIZE(pool->bos[0]); s++) {
1206       for (unsigned i = 0; i < 16; i++) {
1207          if (pool->bos[i][s] != NULL)
1208             anv_device_release_bo(device, pool->bos[i][s]);
1209       }
1210    }
1211 
1212    for (unsigned i = 0; i < 16; i++) {
1213       if (pool->surf_states[i].map != NULL) {
1214          anv_state_pool_free(&device->scratch_surface_state_pool,
1215                              pool->surf_states[i]);
1216       }
1217    }
1218 }
1219 
1220 struct anv_bo *
1221 anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
1222                        gl_shader_stage stage, unsigned per_thread_scratch)
1223 {
1224    if (per_thread_scratch == 0)
1225       return NULL;
1226 
1227    unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1228    assert(scratch_size_log2 < 16);
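   /* Example: per_thread_scratch == 8192 gives 8192 / 2048 == 4, and
    * ffs(4) == 3, so this allocation lives in slot 3 of the pool.
    */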
1229 
1230    assert(stage < ARRAY_SIZE(pool->bos));
1231 
1232    const struct intel_device_info *devinfo = device->info;
1233 
1234    /* On GFX version 12.5, scratch access changed to a surface-based model.
1235     * Instead of each shader type having its own layout based on IDs passed
1236     * from the relevant fixed-function unit, all scratch access is based on
1237     * thread IDs like it always has been for compute.
1238     */
1239    if (devinfo->verx10 >= 125)
1240       stage = MESA_SHADER_COMPUTE;
1241 
1242    struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]);
1243 
1244    if (bo != NULL)
1245       return bo;
1246 
1247    assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
1248    uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage];
1249 
1250    /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they
1251     * are still relative to the general state base address.  When we emit
1252     * STATE_BASE_ADDRESS, we set general state base address to 0 and the size
1253     * to the maximum (1 page under 4GB).  This allows us to just place the
1254     * scratch buffers anywhere we wish in the bottom 32 bits of address space
1255     * and just set the scratch base pointer in 3DSTATE_*S using a relocation.
1256     * However, in order to do so, we need to ensure that the kernel does not
1257     * place the scratch BO above the 32-bit boundary.
1258     *
1259     * NOTE: Technically, it can't go "anywhere" because the top page is off
1260     * limits.  However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the
1261     * kernel allocates space using
1262     *
1263     *    end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE);
1264     *
1265     * so nothing will ever touch the top page.
1266     */
1267    const enum anv_bo_alloc_flags alloc_flags =
1268       ANV_BO_ALLOC_INTERNAL |
1269       (devinfo->verx10 < 125 ? ANV_BO_ALLOC_32BIT_ADDRESS : 0);
1270    VkResult result = anv_device_alloc_bo(device, "scratch", size,
1271                                          alloc_flags,
1272                                          0 /* explicit_address */,
1273                                          &bo);
1274    if (result != VK_SUCCESS)
1275       return NULL; /* TODO */
1276 
1277    struct anv_bo *current_bo =
1278       p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo);
1279    if (current_bo) {
1280       anv_device_release_bo(device, bo);
1281       return current_bo;
1282    } else {
1283       return bo;
1284    }
1285 }
1286 
1287 uint32_t
1288 anv_scratch_pool_get_surf(struct anv_device *device,
1289                           struct anv_scratch_pool *pool,
1290                           unsigned per_thread_scratch)
1291 {
1292    assert(device->info->verx10 >= 125);
1293 
1294    if (per_thread_scratch == 0)
1295       return 0;
1296 
1297    unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1298    assert(scratch_size_log2 < 16);
1299 
1300    uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]);
1301    if (surf > 0)
1302       return surf;
1303 
1304    struct anv_bo *bo =
1305       anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE,
1306                              per_thread_scratch);
1307    struct anv_address addr = { .bo = bo };
1308 
1309    struct anv_state state =
1310       anv_state_pool_alloc(&device->scratch_surface_state_pool,
1311                            device->isl_dev.ss.size, 64);
1312 
1313    isl_buffer_fill_state(&device->isl_dev, state.map,
1314                          .address = anv_address_physical(addr),
1315                          .size_B = bo->size,
1316                          .mocs = anv_mocs(device, bo, 0),
1317                          .format = ISL_FORMAT_RAW,
1318                          .swizzle = ISL_SWIZZLE_IDENTITY,
1319                          .stride_B = per_thread_scratch,
1320                          .is_scratch = true);
1321 
1322    uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
1323                                        0, state.offset);
1324    if (current) {
1325       anv_state_pool_free(&device->scratch_surface_state_pool, state);
1326       return current;
1327    } else {
1328       pool->surf_states[scratch_size_log2] = state;
1329       return state.offset;
1330    }
1331 }
1332 
1333 VkResult
1334 anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device)
1335 {
1336    util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024);
1337 
1338    if (pthread_mutex_init(&cache->mutex, NULL)) {
1339       util_sparse_array_finish(&cache->bo_map);
1340       return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
1341                        "pthread_mutex_init failed: %m");
1342    }
1343 
1344    return VK_SUCCESS;
1345 }
1346 
1347 void
1348 anv_bo_cache_finish(struct anv_bo_cache *cache)
1349 {
1350    util_sparse_array_finish(&cache->bo_map);
1351    pthread_mutex_destroy(&cache->mutex);
1352 }
1353 
1354 static void
1355 anv_bo_unmap_close(struct anv_device *device, struct anv_bo *bo)
1356 {
1357    if (bo->map && !bo->from_host_ptr)
1358       anv_device_unmap_bo(device, bo, bo->map, bo->size);
1359 
1360    assert(bo->gem_handle != 0);
1361    device->kmd_backend->gem_close(device, bo);
1362 }
1363 
1364 static void
1365 anv_bo_vma_free(struct anv_device *device, struct anv_bo *bo)
1366 {
1367    if (bo->offset != 0 && !(bo->alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS)) {
1368       assert(bo->vma_heap != NULL);
1369       anv_vma_free(device, bo->vma_heap, bo->offset, bo->size);
1370    }
1371    bo->vma_heap = NULL;
1372 }
1373 
1374 static void
1375 anv_bo_finish(struct anv_device *device, struct anv_bo *bo)
1376 {
1377    /* Not releasing vma in case unbind fails */
1378    if (device->kmd_backend->vm_unbind_bo(device, bo) == 0)
1379       anv_bo_vma_free(device, bo);
1380 
1381    anv_bo_unmap_close(device, bo);
1382 }
1383 
1384 static VkResult
1385 anv_bo_vma_alloc_or_close(struct anv_device *device,
1386                           struct anv_bo *bo,
1387                           enum anv_bo_alloc_flags alloc_flags,
1388                           uint64_t explicit_address)
1389 {
1390    assert(bo->vma_heap == NULL);
1391    assert(explicit_address == intel_48b_address(explicit_address));
1392 
1393    uint32_t align = device->physical->info.mem_alignment;
1394 
1395    /* If it's big enough to store a tiled resource, we need 64K alignment */
1396    if (bo->size >= 64 * 1024)
1397       align = MAX2(64 * 1024, align);
1398 
1399    /* If we're using the AUX map, make sure we follow the required
1400     * alignment.
1401     */
1402    if (alloc_flags & ANV_BO_ALLOC_AUX_TT_ALIGNED)
1403       align = MAX2(intel_aux_map_get_alignment(device->aux_map_ctx), align);
1404 
1405    /* Opportunistically align addresses to 2MiB when the BO is at least
1406     * 1MiB, because this gives the kernel an opportunity to use Transparent
1407     * Huge Pages (the 2MiB page-table layout) for faster memory access.
1408     *
1409     * Only applied on ICL+.
1410     */
1411    if (device->info->ver >= 11 && bo->size >= 1 * 1024 * 1024)
1412       align = MAX2(2 * 1024 * 1024, align);
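   /* Worked example (hypothetical sizes): a 3MiB BO on an ICL+ part ends up
    * 2MiB-aligned, while a 128KiB BO only gets the 64KiB tiled-resource
    * alignment chosen above.
    */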
1413 
1414    if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
1415       bo->offset = intel_canonical_address(explicit_address);
1416    } else {
1417       bo->offset = anv_vma_alloc(device, bo->size, align, alloc_flags,
1418                                  explicit_address, &bo->vma_heap);
1419       if (bo->offset == 0) {
1420          anv_bo_unmap_close(device, bo);
1421          return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
1422                           "failed to allocate virtual address for BO");
1423       }
1424    }
1425 
1426    return VK_SUCCESS;
1427 }
1428 
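/* Pick the CPU mmap caching mode for a BO.  With the PAT uAPI the PAT entry
 * decides; otherwise discrete parts use WC unless the BO is in system memory
 * or imported, non-LLC integrated parts use WB only when host-cached memory
 * was requested, and LLC parts default to WB except for scanout/external BOs.
 */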
1429 enum intel_device_info_mmap_mode
1430 anv_bo_get_mmap_mode(struct anv_device *device, struct anv_bo *bo)
1431 {
1432    enum anv_bo_alloc_flags alloc_flags = bo->alloc_flags;
1433 
1434    if (device->info->has_set_pat_uapi)
1435       return anv_device_get_pat_entry(device, alloc_flags)->mmap;
1436 
1437    if (anv_physical_device_has_vram(device->physical)) {
1438       if ((alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) ||
1439           (alloc_flags & ANV_BO_ALLOC_IMPORTED))
1440          return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1441 
1442       return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1443    }
1444 
1445    /* gfx9 atom */
1446    if (!device->info->has_llc) {
1447       /* The user may want cached, coherent memory, but achieving that
1448        * without an LLC on these older platforms requires
1449        * DRM_IOCTL_I915_GEM_SET_CACHING to be supported and set.
1450        */
1451       if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
1452          return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1453 
1454       return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1455    }
1456 
1457    if (alloc_flags & (ANV_BO_ALLOC_SCANOUT | ANV_BO_ALLOC_EXTERNAL))
1458       return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1459 
1460    return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1461 }
1462 
1463 VkResult
1464 anv_device_alloc_bo(struct anv_device *device,
1465                     const char *name,
1466                     uint64_t size,
1467                     enum anv_bo_alloc_flags alloc_flags,
1468                     uint64_t explicit_address,
1469                     struct anv_bo **bo_out)
1470 {
1471    /* A BO that needs CPU access must be HOST_CACHED, HOST_COHERENT, or both */
1472    assert((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0 ||
1473           (alloc_flags & (ANV_BO_ALLOC_HOST_CACHED | ANV_BO_ALLOC_HOST_COHERENT)));
1474 
1475    /* The KMD requires a valid PAT index, so set HOST_COHERENT (WC) on BOs
1476     * that don't need CPU access.
1477     */
1478    if ((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0)
1479       alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
1480 
1481    /* On platforms with LLC we can promote all BOs to cached+coherent for free */
1482    const enum anv_bo_alloc_flags not_allowed_promotion = ANV_BO_ALLOC_SCANOUT |
1483                                                          ANV_BO_ALLOC_EXTERNAL |
1484                                                          ANV_BO_ALLOC_PROTECTED;
1485    if (device->info->has_llc && ((alloc_flags & not_allowed_promotion) == 0))
1486       alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
1487 
1488    const uint32_t bo_flags =
1489          device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
1490 
1491    /* The kernel is going to give us whole pages anyway. */
1492    size = align64(size, 4096);
1493 
1494    const uint64_t ccs_offset = size;
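   /* If the BO needs AUX-CCS data, append it after the main surface.  Its
    * size is the main size divided by the platform's main-to-aux ratio;
    * e.g. with a (hypothetical) 256:1 ratio a 1MiB BO grows by 4KiB,
    * rounded back up to a whole page.
    */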
1495    if (alloc_flags & ANV_BO_ALLOC_AUX_CCS) {
1496       assert(device->info->has_aux_map);
1497       size += DIV_ROUND_UP(size, intel_aux_get_main_to_aux_ratio(device->aux_map_ctx));
1498       size = align64(size, 4096);
1499    }
1500 
1501    const struct intel_memory_class_instance *regions[2];
1502    uint32_t nregions = 0;
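   /* Build the placement list passed to gem_create.  Listing a second region
    * lets the kernel spill the BO there when it cannot live in the first
    * choice (see the MAPPED case below).
    */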
1503 
1504    /* If the device has VRAM, there are multiple memory regions and we need
1505     * to choose between them.
1506     */
1507    if (anv_physical_device_has_vram(device->physical)) {
1508       /* By default, try to put the object in local memory.  Here
1509        * vram_non_mappable & vram_mappable are actually the same region.
1510        */
1511       if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
1512          regions[nregions++] = device->physical->sys.region;
1513       else
1514          regions[nregions++] = device->physical->vram_non_mappable.region;
1515 
1516       /* If the buffer is mapped on the host, add the system memory region.
1517        * This ensures that if the buffer cannot live in mappable local memory,
1518        * it can be spilled to system memory.
1519        */
1520       if (!(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
1521           ((alloc_flags & ANV_BO_ALLOC_MAPPED) ||
1522            (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)))
1523          regions[nregions++] = device->physical->sys.region;
1524    } else {
1525       regions[nregions++] = device->physical->sys.region;
1526    }
1527 
1528    uint64_t actual_size;
1529    uint32_t gem_handle = device->kmd_backend->gem_create(device, regions,
1530                                                          nregions, size,
1531                                                          alloc_flags,
1532                                                          &actual_size);
1533    if (gem_handle == 0)
1534       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1535 
1536    struct anv_bo new_bo = {
1537       .name = name,
1538       .gem_handle = gem_handle,
1539       .refcount = 1,
1540       .offset = -1,
1541       .size = size,
1542       .ccs_offset = ccs_offset,
1543       .actual_size = actual_size,
1544       .flags = bo_flags,
1545       .alloc_flags = alloc_flags,
1546    };
1547 
1548    if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
1549       VkResult result = anv_device_map_bo(device, &new_bo, 0, size, &new_bo.map);
1550       if (unlikely(result != VK_SUCCESS)) {
1551          device->kmd_backend->gem_close(device, &new_bo);
1552          return result;
1553       }
1554    }
1555 
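   /* On failure, anv_bo_vma_alloc_or_close unmaps and closes the BO itself
    * (hence the name), so no extra cleanup is needed here.
    */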
1556    VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1557                                                alloc_flags,
1558                                                explicit_address);
1559    if (result != VK_SUCCESS)
1560       return result;
1561 
1562    if (device->kmd_backend->vm_bind_bo(device, &new_bo)) {
1563       anv_bo_vma_free(device, &new_bo);
1564       anv_bo_unmap_close(device, &new_bo);
1565       return vk_errorf(device, VK_ERROR_UNKNOWN, "vm bind failed");
1566    }
1567 
1568    assert(new_bo.gem_handle);
1569 
1570    /* Since we just created this gem_handle, we know no one else is touching
1571     * this BO at the moment, so we don't need to lock here.
1572     */
1573    struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle);
1574    *bo = new_bo;
1575 
1576    *bo_out = bo;
1577 
1578    ANV_RMV(bo_allocate, device, bo);
1579 
1580    return VK_SUCCESS;
1581 }
1582 
1583 VkResult
1584 anv_device_map_bo(struct anv_device *device,
1585                   struct anv_bo *bo,
1586                   uint64_t offset,
1587                   size_t size,
1588                   void **map_out)
1589 {
1590    assert(!bo->from_host_ptr);
1591    assert(size > 0);
1592 
1593    void *map = anv_gem_mmap(device, bo, offset, size);
1594    if (unlikely(map == MAP_FAILED))
1595       return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
1596 
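   /* anv_gem_mmap reports failure with MAP_FAILED rather than NULL, so a
    * NULL map here would indicate a bug, not a runtime error.
    */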
1597    assert(map != NULL);
1598 
1599    if (map_out)
1600       *map_out = map;
1601 
1602    return VK_SUCCESS;
1603 }
1604 
1605 void
1606 anv_device_unmap_bo(struct anv_device *device,
1607                     struct anv_bo *bo,
1608                     void *map, size_t map_size)
1609 {
1610    assert(!bo->from_host_ptr);
1611 
1612    anv_gem_munmap(device, map, map_size);
1613 }
1614 
1615 VkResult
1616 anv_device_import_bo_from_host_ptr(struct anv_device *device,
1617                                    void *host_ptr, uint32_t size,
1618                                    enum anv_bo_alloc_flags alloc_flags,
1619                                    uint64_t client_address,
1620                                    struct anv_bo **bo_out)
1621 {
1622    assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
1623                            ANV_BO_ALLOC_HOST_CACHED |
1624                            ANV_BO_ALLOC_HOST_COHERENT |
1625                            ANV_BO_ALLOC_AUX_CCS |
1626                            ANV_BO_ALLOC_PROTECTED |
1627                            ANV_BO_ALLOC_FIXED_ADDRESS)));
1628    assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
1629 
1630    struct anv_bo_cache *cache = &device->bo_cache;
1631    const uint32_t bo_flags =
1632          device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
1633 
1634    uint32_t gem_handle = device->kmd_backend->gem_create_userptr(device, host_ptr, size);
1635    if (!gem_handle)
1636       return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1637 
1638    pthread_mutex_lock(&cache->mutex);
1639 
1640    struct anv_bo *bo = NULL;
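   /* With the Xe KMD, userptr BOs are not tracked in the BO cache's sparse
    * array; they get a standalone allocation here and are freed separately
    * in anv_device_release_bo.
    */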
1641    if (device->info->kmd_type == INTEL_KMD_TYPE_XE) {
1642       bo = vk_zalloc(&device->vk.alloc, sizeof(*bo), 8,
1643                      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1644       if (!bo) {
1645          pthread_mutex_unlock(&cache->mutex);
1646          return VK_ERROR_OUT_OF_HOST_MEMORY;
1647       }
1648    } else {
1649       bo = anv_device_lookup_bo(device, gem_handle);
1650    }
1651 
1652    if (bo->refcount > 0) {
1653       /* VK_EXT_external_memory_host doesn't require handling imports of the
1654        * same pointer twice at the same time, but we don't get in the way.  If
1655        * the kernel gives us the same gem_handle, only succeed if the flags match.
1656        */
1657       assert(bo->gem_handle == gem_handle);
1658       if (bo_flags != bo->flags) {
1659          pthread_mutex_unlock(&cache->mutex);
1660          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1661                           "same host pointer imported two different ways");
1662       }
1663 
1664       if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
1665           (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
1666          pthread_mutex_unlock(&cache->mutex);
1667          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1668                           "The same BO was imported with and without buffer "
1669                           "device address");
1670       }
1671 
1672       if (client_address && client_address != intel_48b_address(bo->offset)) {
1673          pthread_mutex_unlock(&cache->mutex);
1674          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1675                           "The same BO was imported at two different "
1676                           "addresses");
1677       }
1678 
1679       __sync_fetch_and_add(&bo->refcount, 1);
1680    } else {
1681       alloc_flags |= ANV_BO_ALLOC_IMPORTED;
1682       struct anv_bo new_bo = {
1683          .name = "host-ptr",
1684          .gem_handle = gem_handle,
1685          .refcount = 1,
1686          .offset = -1,
1687          .size = size,
1688          .actual_size = size,
1689          .map = host_ptr,
1690          .flags = bo_flags,
1691          .alloc_flags = alloc_flags,
1692          .from_host_ptr = true,
1693       };
1694 
1695       VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1696                                                   alloc_flags,
1697                                                   client_address);
1698       if (result != VK_SUCCESS) {
1699          pthread_mutex_unlock(&cache->mutex);
1700          return result;
1701       }
1702 
1703       if (device->kmd_backend->vm_bind_bo(device, &new_bo)) {
1704          VkResult res = vk_errorf(device, VK_ERROR_UNKNOWN, "vm bind failed: %m");
1705          anv_bo_vma_free(device, &new_bo);
1706          pthread_mutex_unlock(&cache->mutex);
1707          return res;
1708       }
1709 
1710       *bo = new_bo;
1711 
1712       ANV_RMV(bo_allocate, device, bo);
1713    }
1714 
1715    pthread_mutex_unlock(&cache->mutex);
1716    *bo_out = bo;
1717 
1718    return VK_SUCCESS;
1719 }
1720 
1721 VkResult
1722 anv_device_import_bo(struct anv_device *device,
1723                      int fd,
1724                      enum anv_bo_alloc_flags alloc_flags,
1725                      uint64_t client_address,
1726                      struct anv_bo **bo_out)
1727 {
1728    assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
1729                            ANV_BO_ALLOC_HOST_CACHED |
1730                            ANV_BO_ALLOC_HOST_COHERENT |
1731                            ANV_BO_ALLOC_FIXED_ADDRESS)));
1732    assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
1733 
1734    struct anv_bo_cache *cache = &device->bo_cache;
1735 
1736    pthread_mutex_lock(&cache->mutex);
1737 
1738    uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);
1739    if (!gem_handle) {
1740       pthread_mutex_unlock(&cache->mutex);
1741       return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1742    }
1743 
1744    struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
1745 
1746    uint32_t bo_flags;
1747    VkResult result = anv_gem_import_bo_alloc_flags_to_bo_flags(device, bo,
1748                                                                alloc_flags,
1749                                                                &bo_flags);
1750    if (result != VK_SUCCESS) {
1751       pthread_mutex_unlock(&cache->mutex);
1752       return result;
1753    }
1754 
1755    if (bo->refcount > 0) {
1756       if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
1757           (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
1758          pthread_mutex_unlock(&cache->mutex);
1759          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1760                           "The same BO was imported with and without buffer "
1761                           "device address");
1762       }
1763 
1764       if (client_address && client_address != intel_48b_address(bo->offset)) {
1765          pthread_mutex_unlock(&cache->mutex);
1766          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1767                           "The same BO was imported at two different "
1768                           "addresses");
1769       }
1770 
1771       __sync_fetch_and_add(&bo->refcount, 1);
1772    } else {
1773       alloc_flags |= ANV_BO_ALLOC_IMPORTED;
1774       struct anv_bo new_bo = {
1775          .name = "imported",
1776          .gem_handle = gem_handle,
1777          .refcount = 1,
1778          .offset = -1,
1779          .alloc_flags = alloc_flags,
1780       };
1781 
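      /* dma-buf file descriptors don't carry an explicit size, but
       * lseek(SEEK_END) reports it.
       */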
1782       off_t size = lseek(fd, 0, SEEK_END);
1783       if (size == (off_t)-1) {
1784          device->kmd_backend->gem_close(device, &new_bo);
1785          pthread_mutex_unlock(&cache->mutex);
1786          return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1787       }
1788       new_bo.size = size;
1789       new_bo.actual_size = size;
1790 
1791       VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1792                                                   alloc_flags,
1793                                                   client_address);
1794       if (result != VK_SUCCESS) {
1795          pthread_mutex_unlock(&cache->mutex);
1796          return result;
1797       }
1798 
1799       if (device->kmd_backend->vm_bind_bo(device, &new_bo)) {
1800          anv_bo_vma_free(device, &new_bo);
1801          pthread_mutex_unlock(&cache->mutex);
1802          return vk_errorf(device, VK_ERROR_UNKNOWN, "vm bind failed");
1803       }
1804 
1805       *bo = new_bo;
1806 
1807       ANV_RMV(bo_allocate, device, bo);
1808    }
1809 
1810    bo->flags = bo_flags;
1811 
1812    pthread_mutex_unlock(&cache->mutex);
1813    *bo_out = bo;
1814 
1815    return VK_SUCCESS;
1816 }
1817 
1818 VkResult
1819 anv_device_export_bo(struct anv_device *device,
1820                      struct anv_bo *bo, int *fd_out)
1821 {
1822    assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);
1823 
1824    /* This BO must have been flagged external in order for us to be able
1825     * to export it.  This is done based on external options passed into
1826     * anv_AllocateMemory.
1827     */
1828    assert(anv_bo_is_external(bo));
1829 
1830    int fd = anv_gem_handle_to_fd(device, bo->gem_handle);
1831    if (fd < 0)
1832       return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS);
1833 
1834    *fd_out = fd;
1835 
1836    return VK_SUCCESS;
1837 }
1838 
1839 VkResult
1840 anv_device_get_bo_tiling(struct anv_device *device,
1841                          struct anv_bo *bo,
1842                          enum isl_tiling *tiling_out)
1843 {
1844    int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
1845    if (i915_tiling < 0) {
1846       return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1847                        "failed to get BO tiling: %m");
1848    }
1849 
1850    *tiling_out = isl_tiling_from_i915_tiling(i915_tiling);
1851 
1852    return VK_SUCCESS;
1853 }
1854 
1855 VkResult
1856 anv_device_set_bo_tiling(struct anv_device *device,
1857                          struct anv_bo *bo,
1858                          uint32_t row_pitch_B,
1859                          enum isl_tiling tiling)
1860 {
1861    int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B,
1862                                 isl_tiling_to_i915_tiling(tiling));
1863    if (ret) {
1864       return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
1865                        "failed to set BO tiling: %m");
1866    }
1867 
1868    return VK_SUCCESS;
1869 }
1870 
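/* Atomically decrement *counter unless that would drop the last reference
 * (i.e. the value is 1).  Returns true if the decrement happened.  This lets
 * anv_device_release_bo take the slow, locked path only when it might be
 * releasing the final reference.
 */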
1871 static bool
1872 atomic_dec_not_one(uint32_t *counter)
1873 {
1874    uint32_t old, val;
1875 
1876    val = *counter;
1877    while (1) {
1878       if (val == 1)
1879          return false;
1880 
1881       old = __sync_val_compare_and_swap(counter, val, val - 1);
1882       if (old == val)
1883          return true;
1884 
1885       val = old;
1886    }
1887 }
1888 
1889 void
1890 anv_device_release_bo(struct anv_device *device,
1891                       struct anv_bo *bo)
1892 {
1893    struct anv_bo_cache *cache = &device->bo_cache;
1894    const bool bo_is_xe_userptr = device->info->kmd_type == INTEL_KMD_TYPE_XE &&
1895                                  bo->from_host_ptr;
1896    assert(bo_is_xe_userptr ||
1897           anv_device_lookup_bo(device, bo->gem_handle) == bo);
1898 
1899    /* Try to decrement the counter but don't go below one.  If this succeeds
1900     * then the refcount has been decremented and we are not the last
1901     * reference.
1902     */
1903    if (atomic_dec_not_one(&bo->refcount))
1904       return;
1905 
1906    ANV_RMV(bo_destroy, device, bo);
1907 
1908    pthread_mutex_lock(&cache->mutex);
1909 
1910    /* We are probably the last reference since our attempt to decrement above
1911     * failed.  However, we can't actually know until we are inside the mutex.
1912     * Otherwise, someone could import the BO between the decrement and our
1913     * taking the mutex.
1914     */
1915    if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) {
1916       /* Turns out we're not the last reference.  Unlock and bail. */
1917       pthread_mutex_unlock(&cache->mutex);
1918       return;
1919    }
1920    assert(bo->refcount == 0);
1921 
1922    /* Memset the BO just in case.  The refcount being zero should be enough
1923     * to prevent anyone from assuming the data is valid, but it's safer to
1924     * stomp it to zero anyway.  We explicitly do this *before* we actually
1925     * close the GEM handle so that, if someone allocates something and gets
1926     * the same GEM handle, the memset has already happened and won't stomp
1927     * all over any data they may write in this BO.
1928     */
1929    struct anv_bo old_bo = *bo;
1930 
1931    if (bo_is_xe_userptr)
1932       vk_free(&device->vk.alloc, bo);
1933    else
1934       memset(bo, 0, sizeof(*bo));
1935 
1936    anv_bo_finish(device, &old_bo);
1937 
1938    /* Don't unlock until we've actually closed the BO.  The whole point of
1939     * the BO cache is to ensure that we correctly handle races with creating
1940     * and releasing GEM handles and we don't want to let someone import the BO
1941     * again between mutex unlock and closing the GEM handle.
1942     */
1943    pthread_mutex_unlock(&cache->mutex);
1944 }
1945