• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <limits.h>
27 #include <assert.h>
28 #include <sys/mman.h>
29 
30 #include "anv_private.h"
31 
32 #include "common/intel_aux_map.h"
33 #include "util/anon_file.h"
34 #include "util/futex.h"
35 
36 #ifdef HAVE_VALGRIND
37 #define VG_NOACCESS_READ(__ptr) ({                       \
38    VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \
39    __typeof(*(__ptr)) __val = *(__ptr);                  \
40    VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\
41    __val;                                                \
42 })
43 #define VG_NOACCESS_WRITE(__ptr, __val) ({                  \
44    VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr)));  \
45    *(__ptr) = (__val);                                      \
46    VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));   \
47 })
48 #else
49 #define VG_NOACCESS_READ(__ptr) (*(__ptr))
50 #define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val))
51 #endif
52 
53 #ifndef MAP_POPULATE
54 #define MAP_POPULATE 0
55 #endif
56 
57 /* Design goals:
58  *
59  *  - Lock free (except when resizing underlying bos)
60  *
61  *  - Constant time allocation with typically only one atomic
62  *
63  *  - Multiple allocation sizes without fragmentation
64  *
65  *  - Can grow while keeping addresses and offset of contents stable
66  *
67  *  - All allocations within one bo so we can point one of the
68  *    STATE_BASE_ADDRESS pointers at it.
69  *
70  * The overall design is a two-level allocator: top level is a fixed size, big
71  * block (8k) allocator, which operates out of a bo.  Allocation is done by
72  * either pulling a block from the free list or growing the used range of the
73  * bo.  Growing the range may run out of space in the bo which we then need to
74  * grow.  Growing the bo is tricky in a multi-threaded, lockless environment:
75  * we need to keep all pointers and contents in the old map valid.  GEM bos in
76  * general can't grow, but we use a trick: we create a memfd and use ftruncate
77  * to grow it as necessary.  We mmap the new size and then create a gem bo for
78  * it using the new gem userptr ioctl.  Without heavy-handed locking around
79  * our allocation fast-path, there isn't really a way to munmap the old mmap,
80  * so we just keep it around until garbage collection time.  While the block
81  * allocator is lockless for normal operations, we block other threads trying
82  * to allocate while we're growing the map.  It shouldn't happen often, and
83  * growing is fast anyway.
84  *
85  * At the next level we can use various sub-allocators.  The state pool is a
86  * pool of smaller, fixed size objects, which operates much like the block
87  * pool.  It uses a free list for freeing objects, but when it runs out of
88  * space it just allocates a new block from the block pool.  This allocator is
89  * intended for longer lived state objects such as SURFACE_STATE and most
90  * other persistent state objects in the API.  We may need to track more info
91  * with these object and a pointer back to the CPU object (eg VkImage).  In
92  * those cases we just allocate a slightly bigger object and put the extra
93  * state after the GPU state object.
94  *
95  * The state stream allocator works similar to how the i965 DRI driver streams
96  * all its state.  Even with Vulkan, we need to emit transient state (whether
97  * surface state base or dynamic state base), and for that we can just get a
98  * block and fill it up.  These cases are local to a command buffer and the
99  * sub-allocator need not be thread safe.  The streaming allocator gets a new
100  * block when it runs out of space and chains them together so they can be
101  * easily freed.
102  */
103 
104 /* Allocations are always at least 64 byte aligned, so 1 is an invalid value.
105  * We use it to indicate the free list is empty. */
106 #define EMPTY UINT32_MAX
107 
108 /* On FreeBSD PAGE_SIZE is already defined in
109  * /usr/include/machine/param.h that is indirectly
110  * included here.
111  */
112 #ifndef PAGE_SIZE
113 #define PAGE_SIZE 4096
114 #endif
115 
116 struct anv_mmap_cleanup {
117    void *map;
118    size_t size;
119 };
120 
121 static inline uint32_t
ilog2_round_up(uint32_t value)122 ilog2_round_up(uint32_t value)
123 {
124    assert(value != 0);
125    return 32 - __builtin_clz(value - 1);
126 }
127 
128 static inline uint32_t
round_to_power_of_two(uint32_t value)129 round_to_power_of_two(uint32_t value)
130 {
131    return 1 << ilog2_round_up(value);
132 }
133 
134 struct anv_state_table_cleanup {
135    void *map;
136    size_t size;
137 };
138 
139 #define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0})
140 #define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry))
141 
142 static VkResult
143 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size);
144 
145 VkResult
anv_state_table_init(struct anv_state_table * table,struct anv_device * device,uint32_t initial_entries)146 anv_state_table_init(struct anv_state_table *table,
147                     struct anv_device *device,
148                     uint32_t initial_entries)
149 {
150    VkResult result;
151 
152    table->device = device;
153 
154    /* Just make it 2GB up-front.  The Linux kernel won't actually back it
155     * with pages until we either map and fault on one of them or we use
156     * userptr and send a chunk of it off to the GPU.
157     */
158    table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table");
159    if (table->fd == -1)
160       return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
161 
162    if (!u_vector_init(&table->cleanups, 8,
163                       sizeof(struct anv_state_table_cleanup))) {
164       result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
165       goto fail_fd;
166    }
167 
168    table->state.next = 0;
169    table->state.end = 0;
170    table->size = 0;
171 
172    uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE;
173    result = anv_state_table_expand_range(table, initial_size);
174    if (result != VK_SUCCESS)
175       goto fail_cleanups;
176 
177    return VK_SUCCESS;
178 
179  fail_cleanups:
180    u_vector_finish(&table->cleanups);
181  fail_fd:
182    close(table->fd);
183 
184    return result;
185 }
186 
187 static VkResult
anv_state_table_expand_range(struct anv_state_table * table,uint32_t size)188 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
189 {
190    void *map;
191    struct anv_state_table_cleanup *cleanup;
192 
193    /* Assert that we only ever grow the pool */
194    assert(size >= table->state.end);
195 
196    /* Make sure that we don't go outside the bounds of the memfd */
197    if (size > BLOCK_POOL_MEMFD_SIZE)
198       return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
199 
200    cleanup = u_vector_add(&table->cleanups);
201    if (!cleanup)
202       return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
203 
204    *cleanup = ANV_STATE_TABLE_CLEANUP_INIT;
205 
206    /* Just leak the old map until we destroy the pool.  We can't munmap it
207     * without races or imposing locking on the block allocate fast path. On
208     * the whole the leaked maps adds up to less than the size of the
209     * current map.  MAP_POPULATE seems like the right thing to do, but we
210     * should try to get some numbers.
211     */
212    map = mmap(NULL, size, PROT_READ | PROT_WRITE,
213               MAP_SHARED | MAP_POPULATE, table->fd, 0);
214    if (map == MAP_FAILED) {
215       return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY,
216                        "mmap failed: %m");
217    }
218 
219    cleanup->map = map;
220    cleanup->size = size;
221 
222    table->map = map;
223    table->size = size;
224 
225    return VK_SUCCESS;
226 }
227 
228 static VkResult
anv_state_table_grow(struct anv_state_table * table)229 anv_state_table_grow(struct anv_state_table *table)
230 {
231    VkResult result = VK_SUCCESS;
232 
233    uint32_t used = align_u32(table->state.next * ANV_STATE_ENTRY_SIZE,
234                              PAGE_SIZE);
235    uint32_t old_size = table->size;
236 
237    /* The block pool is always initialized to a nonzero size and this function
238     * is always called after initialization.
239     */
240    assert(old_size > 0);
241 
242    uint32_t required = MAX2(used, old_size);
243    if (used * 2 <= required) {
244       /* If we're in this case then this isn't the firsta allocation and we
245        * already have enough space on both sides to hold double what we
246        * have allocated.  There's nothing for us to do.
247        */
248       goto done;
249    }
250 
251    uint32_t size = old_size * 2;
252    while (size < required)
253       size *= 2;
254 
255    assert(size > table->size);
256 
257    result = anv_state_table_expand_range(table, size);
258 
259  done:
260    return result;
261 }
262 
263 void
anv_state_table_finish(struct anv_state_table * table)264 anv_state_table_finish(struct anv_state_table *table)
265 {
266    struct anv_state_table_cleanup *cleanup;
267 
268    u_vector_foreach(cleanup, &table->cleanups) {
269       if (cleanup->map)
270          munmap(cleanup->map, cleanup->size);
271    }
272 
273    u_vector_finish(&table->cleanups);
274 
275    close(table->fd);
276 }
277 
278 VkResult
anv_state_table_add(struct anv_state_table * table,uint32_t * idx,uint32_t count)279 anv_state_table_add(struct anv_state_table *table, uint32_t *idx,
280                     uint32_t count)
281 {
282    struct anv_block_state state, old, new;
283    VkResult result;
284 
285    assert(idx);
286 
287    while(1) {
288       state.u64 = __sync_fetch_and_add(&table->state.u64, count);
289       if (state.next + count <= state.end) {
290          assert(table->map);
291          struct anv_free_entry *entry = &table->map[state.next];
292          for (int i = 0; i < count; i++) {
293             entry[i].state.idx = state.next + i;
294          }
295          *idx = state.next;
296          return VK_SUCCESS;
297       } else if (state.next <= state.end) {
298          /* We allocated the first block outside the pool so we have to grow
299           * the pool.  pool_state->next acts a mutex: threads who try to
300           * allocate now will get block indexes above the current limit and
301           * hit futex_wait below.
302           */
303          new.next = state.next + count;
304          do {
305             result = anv_state_table_grow(table);
306             if (result != VK_SUCCESS)
307                return result;
308             new.end = table->size / ANV_STATE_ENTRY_SIZE;
309          } while (new.end < new.next);
310 
311          old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64);
312          if (old.next != state.next)
313             futex_wake(&table->state.end, INT_MAX);
314       } else {
315          futex_wait(&table->state.end, state.end, NULL);
316          continue;
317       }
318    }
319 }
320 
321 void
anv_free_list_push(union anv_free_list * list,struct anv_state_table * table,uint32_t first,uint32_t count)322 anv_free_list_push(union anv_free_list *list,
323                    struct anv_state_table *table,
324                    uint32_t first, uint32_t count)
325 {
326    union anv_free_list current, old, new;
327    uint32_t last = first;
328 
329    for (uint32_t i = 1; i < count; i++, last++)
330       table->map[last].next = last + 1;
331 
332    old.u64 = list->u64;
333    do {
334       current = old;
335       table->map[last].next = current.offset;
336       new.offset = first;
337       new.count = current.count + 1;
338       old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
339    } while (old.u64 != current.u64);
340 }
341 
342 struct anv_state *
anv_free_list_pop(union anv_free_list * list,struct anv_state_table * table)343 anv_free_list_pop(union anv_free_list *list,
344                   struct anv_state_table *table)
345 {
346    union anv_free_list current, new, old;
347 
348    current.u64 = list->u64;
349    while (current.offset != EMPTY) {
350       __sync_synchronize();
351       new.offset = table->map[current.offset].next;
352       new.count = current.count + 1;
353       old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
354       if (old.u64 == current.u64) {
355          struct anv_free_entry *entry = &table->map[current.offset];
356          return &entry->state;
357       }
358       current = old;
359    }
360 
361    return NULL;
362 }
363 
364 static VkResult
365 anv_block_pool_expand_range(struct anv_block_pool *pool,
366                             uint32_t center_bo_offset, uint32_t size);
367 
368 VkResult
anv_block_pool_init(struct anv_block_pool * pool,struct anv_device * device,const char * name,uint64_t start_address,uint32_t initial_size)369 anv_block_pool_init(struct anv_block_pool *pool,
370                     struct anv_device *device,
371                     const char *name,
372                     uint64_t start_address,
373                     uint32_t initial_size)
374 {
375    VkResult result;
376 
377    if (device->info.verx10 >= 125) {
378       /* Make sure VMA addresses are 2MiB aligned for the block pool */
379       assert(anv_is_aligned(start_address, 2 * 1024 * 1024));
380       assert(anv_is_aligned(initial_size, 2 * 1024 * 1024));
381    }
382 
383    pool->name = name;
384    pool->device = device;
385    pool->use_relocations = anv_use_relocations(device->physical);
386    pool->nbos = 0;
387    pool->size = 0;
388    pool->center_bo_offset = 0;
389    pool->start_address = intel_canonical_address(start_address);
390    pool->map = NULL;
391 
392    if (!pool->use_relocations) {
393       pool->bo = NULL;
394       pool->fd = -1;
395    } else {
396       /* Just make it 2GB up-front.  The Linux kernel won't actually back it
397        * with pages until we either map and fault on one of them or we use
398        * userptr and send a chunk of it off to the GPU.
399        */
400       pool->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "block pool");
401       if (pool->fd == -1)
402          return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
403 
404       pool->wrapper_bo = (struct anv_bo) {
405          .refcount = 1,
406          .offset = -1,
407          .is_wrapper = true,
408       };
409       pool->bo = &pool->wrapper_bo;
410    }
411 
412    if (!u_vector_init(&pool->mmap_cleanups, 8,
413                       sizeof(struct anv_mmap_cleanup))) {
414       result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
415       goto fail_fd;
416    }
417 
418    pool->state.next = 0;
419    pool->state.end = 0;
420    pool->back_state.next = 0;
421    pool->back_state.end = 0;
422 
423    result = anv_block_pool_expand_range(pool, 0, initial_size);
424    if (result != VK_SUCCESS)
425       goto fail_mmap_cleanups;
426 
427    /* Make the entire pool available in the front of the pool.  If back
428     * allocation needs to use this space, the "ends" will be re-arranged.
429     */
430    pool->state.end = pool->size;
431 
432    return VK_SUCCESS;
433 
434  fail_mmap_cleanups:
435    u_vector_finish(&pool->mmap_cleanups);
436  fail_fd:
437    if (pool->fd >= 0)
438       close(pool->fd);
439 
440    return result;
441 }
442 
443 void
anv_block_pool_finish(struct anv_block_pool * pool)444 anv_block_pool_finish(struct anv_block_pool *pool)
445 {
446    anv_block_pool_foreach_bo(bo, pool) {
447       assert(bo->refcount == 1);
448       anv_device_release_bo(pool->device, bo);
449    }
450 
451    struct anv_mmap_cleanup *cleanup;
452    u_vector_foreach(cleanup, &pool->mmap_cleanups)
453       munmap(cleanup->map, cleanup->size);
454    u_vector_finish(&pool->mmap_cleanups);
455 
456    if (pool->fd >= 0)
457       close(pool->fd);
458 }
459 
460 static VkResult
anv_block_pool_expand_range(struct anv_block_pool * pool,uint32_t center_bo_offset,uint32_t size)461 anv_block_pool_expand_range(struct anv_block_pool *pool,
462                             uint32_t center_bo_offset, uint32_t size)
463 {
464    /* Assert that we only ever grow the pool */
465    assert(center_bo_offset >= pool->back_state.end);
466    assert(size - center_bo_offset >= pool->state.end);
467 
468    /* Assert that we don't go outside the bounds of the memfd */
469    assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER);
470    assert(!pool->use_relocations ||
471           size - center_bo_offset <=
472           BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER);
473 
474    /* For state pool BOs we have to be a bit careful about where we place them
475     * in the GTT.  There are two documented workarounds for state base address
476     * placement : Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset
477     * which state that those two base addresses do not support 48-bit
478     * addresses and need to be placed in the bottom 32-bit range.
479     * Unfortunately, this is not quite accurate.
480     *
481     * The real problem is that we always set the size of our state pools in
482     * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most
483     * likely significantly smaller.  We do this because we do not no at the
484     * time we emit STATE_BASE_ADDRESS whether or not we will need to expand
485     * the pool during command buffer building so we don't actually have a
486     * valid final size.  If the address + size, as seen by STATE_BASE_ADDRESS
487     * overflows 48 bits, the GPU appears to treat all accesses to the buffer
488     * as being out of bounds and returns zero.  For dynamic state, this
489     * usually just leads to rendering corruptions, but shaders that are all
490     * zero hang the GPU immediately.
491     *
492     * The easiest solution to do is exactly what the bogus workarounds say to
493     * do: restrict these buffers to 32-bit addresses.  We could also pin the
494     * BO to some particular location of our choosing, but that's significantly
495     * more work than just not setting a flag.  So, we explicitly DO NOT set
496     * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the
497     * hard work for us.  When using softpin, we're in control and the fixed
498     * addresses we choose are fine for base addresses.
499     */
500    enum anv_bo_alloc_flags bo_alloc_flags = ANV_BO_ALLOC_CAPTURE;
501    if (pool->use_relocations)
502       bo_alloc_flags |= ANV_BO_ALLOC_32BIT_ADDRESS;
503 
504    if (!pool->use_relocations) {
505       uint32_t new_bo_size = size - pool->size;
506       struct anv_bo *new_bo;
507       assert(center_bo_offset == 0);
508       VkResult result = anv_device_alloc_bo(pool->device,
509                                             pool->name,
510                                             new_bo_size,
511                                             bo_alloc_flags |
512                                             ANV_BO_ALLOC_LOCAL_MEM |
513                                             ANV_BO_ALLOC_FIXED_ADDRESS |
514                                             ANV_BO_ALLOC_MAPPED |
515                                             ANV_BO_ALLOC_SNOOPED,
516                                             pool->start_address + pool->size,
517                                             &new_bo);
518       if (result != VK_SUCCESS)
519          return result;
520 
521       pool->bos[pool->nbos++] = new_bo;
522 
523       /* This pointer will always point to the first BO in the list */
524       pool->bo = pool->bos[0];
525    } else {
526       /* Just leak the old map until we destroy the pool.  We can't munmap it
527        * without races or imposing locking on the block allocate fast path. On
528        * the whole the leaked maps adds up to less than the size of the
529        * current map.  MAP_POPULATE seems like the right thing to do, but we
530        * should try to get some numbers.
531        */
532       void *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
533                        MAP_SHARED | MAP_POPULATE, pool->fd,
534                        BLOCK_POOL_MEMFD_CENTER - center_bo_offset);
535       if (map == MAP_FAILED)
536          return vk_errorf(pool->device, VK_ERROR_MEMORY_MAP_FAILED,
537                           "mmap failed: %m");
538 
539       struct anv_bo *new_bo;
540       VkResult result = anv_device_import_bo_from_host_ptr(pool->device,
541                                                            map, size,
542                                                            bo_alloc_flags,
543                                                            0 /* client_address */,
544                                                            &new_bo);
545       if (result != VK_SUCCESS) {
546          munmap(map, size);
547          return result;
548       }
549 
550       struct anv_mmap_cleanup *cleanup = u_vector_add(&pool->mmap_cleanups);
551       if (!cleanup) {
552          munmap(map, size);
553          anv_device_release_bo(pool->device, new_bo);
554          return vk_error(pool->device, VK_ERROR_OUT_OF_HOST_MEMORY);
555       }
556       cleanup->map = map;
557       cleanup->size = size;
558 
559       /* Now that we mapped the new memory, we can write the new
560        * center_bo_offset back into pool and update pool->map. */
561       pool->center_bo_offset = center_bo_offset;
562       pool->map = map + center_bo_offset;
563 
564       pool->bos[pool->nbos++] = new_bo;
565       pool->wrapper_bo.map = new_bo;
566    }
567 
568    assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS);
569    pool->size = size;
570 
571    return VK_SUCCESS;
572 }
573 
574 /** Returns current memory map of the block pool.
575  *
576  * The returned pointer points to the map for the memory at the specified
577  * offset. The offset parameter is relative to the "center" of the block pool
578  * rather than the start of the block pool BO map.
579  */
580 void*
anv_block_pool_map(struct anv_block_pool * pool,int32_t offset,uint32_t size)581 anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
582 {
583    if (!pool->use_relocations) {
584       struct anv_bo *bo = NULL;
585       int32_t bo_offset = 0;
586       anv_block_pool_foreach_bo(iter_bo, pool) {
587          if (offset < bo_offset + iter_bo->size) {
588             bo = iter_bo;
589             break;
590          }
591          bo_offset += iter_bo->size;
592       }
593       assert(bo != NULL);
594       assert(offset >= bo_offset);
595       assert((offset - bo_offset) + size <= bo->size);
596 
597       return bo->map + (offset - bo_offset);
598    } else {
599       return pool->map + offset;
600    }
601 }
602 
603 /** Grows and re-centers the block pool.
604  *
605  * We grow the block pool in one or both directions in such a way that the
606  * following conditions are met:
607  *
608  *  1) The size of the entire pool is always a power of two.
609  *
610  *  2) The pool only grows on both ends.  Neither end can get
611  *     shortened.
612  *
613  *  3) At the end of the allocation, we have about twice as much space
614  *     allocated for each end as we have used.  This way the pool doesn't
615  *     grow too far in one direction or the other.
616  *
617  *  4) If the _alloc_back() has never been called, then the back portion of
618  *     the pool retains a size of zero.  (This makes it easier for users of
619  *     the block pool that only want a one-sided pool.)
620  *
621  *  5) We have enough space allocated for at least one more block in
622  *     whichever side `state` points to.
623  *
624  *  6) The center of the pool is always aligned to both the block_size of
625  *     the pool and a 4K CPU page.
626  */
627 static uint32_t
anv_block_pool_grow(struct anv_block_pool * pool,struct anv_block_state * state,uint32_t contiguous_size)628 anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
629                     uint32_t contiguous_size)
630 {
631    VkResult result = VK_SUCCESS;
632 
633    pthread_mutex_lock(&pool->device->mutex);
634 
635    assert(state == &pool->state || state == &pool->back_state);
636 
637    /* Gather a little usage information on the pool.  Since we may have
638     * threadsd waiting in queue to get some storage while we resize, it's
639     * actually possible that total_used will be larger than old_size.  In
640     * particular, block_pool_alloc() increments state->next prior to
641     * calling block_pool_grow, so this ensures that we get enough space for
642     * which ever side tries to grow the pool.
643     *
644     * We align to a page size because it makes it easier to do our
645     * calculations later in such a way that we state page-aigned.
646     */
647    uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE);
648    uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE);
649    uint32_t total_used = front_used + back_used;
650 
651    assert(state == &pool->state || back_used > 0);
652 
653    uint32_t old_size = pool->size;
654 
655    /* The block pool is always initialized to a nonzero size and this function
656     * is always called after initialization.
657     */
658    assert(old_size > 0);
659 
660    const uint32_t old_back = pool->center_bo_offset;
661    const uint32_t old_front = old_size - pool->center_bo_offset;
662 
663    /* The back_used and front_used may actually be smaller than the actual
664     * requirement because they are based on the next pointers which are
665     * updated prior to calling this function.
666     */
667    uint32_t back_required = MAX2(back_used, old_back);
668    uint32_t front_required = MAX2(front_used, old_front);
669 
670    if (!pool->use_relocations) {
671       /* With softpin, the pool is made up of a bunch of buffers with separate
672        * maps.  Make sure we have enough contiguous space that we can get a
673        * properly contiguous map for the next chunk.
674        */
675       assert(old_back == 0);
676       front_required = MAX2(front_required, old_front + contiguous_size);
677    }
678 
679    if (back_used * 2 <= back_required && front_used * 2 <= front_required) {
680       /* If we're in this case then this isn't the firsta allocation and we
681        * already have enough space on both sides to hold double what we
682        * have allocated.  There's nothing for us to do.
683        */
684       goto done;
685    }
686 
687    uint32_t size = old_size * 2;
688    while (size < back_required + front_required)
689       size *= 2;
690 
691    assert(size > pool->size);
692 
693    /* We compute a new center_bo_offset such that, when we double the size
694     * of the pool, we maintain the ratio of how much is used by each side.
695     * This way things should remain more-or-less balanced.
696     */
697    uint32_t center_bo_offset;
698    if (back_used == 0) {
699       /* If we're in this case then we have never called alloc_back().  In
700        * this case, we want keep the offset at 0 to make things as simple
701        * as possible for users that don't care about back allocations.
702        */
703       center_bo_offset = 0;
704    } else {
705       /* Try to "center" the allocation based on how much is currently in
706        * use on each side of the center line.
707        */
708       center_bo_offset = ((uint64_t)size * back_used) / total_used;
709 
710       /* Align down to a multiple of the page size */
711       center_bo_offset &= ~(PAGE_SIZE - 1);
712 
713       assert(center_bo_offset >= back_used);
714 
715       /* Make sure we don't shrink the back end of the pool */
716       if (center_bo_offset < back_required)
717          center_bo_offset = back_required;
718 
719       /* Make sure that we don't shrink the front end of the pool */
720       if (size - center_bo_offset < front_required)
721          center_bo_offset = size - front_required;
722    }
723 
724    assert(center_bo_offset % PAGE_SIZE == 0);
725 
726    result = anv_block_pool_expand_range(pool, center_bo_offset, size);
727 
728 done:
729    pthread_mutex_unlock(&pool->device->mutex);
730 
731    if (result == VK_SUCCESS) {
732       /* Return the appropriate new size.  This function never actually
733        * updates state->next.  Instead, we let the caller do that because it
734        * needs to do so in order to maintain its concurrency model.
735        */
736       if (state == &pool->state) {
737          return pool->size - pool->center_bo_offset;
738       } else {
739          assert(pool->center_bo_offset > 0);
740          return pool->center_bo_offset;
741       }
742    } else {
743       return 0;
744    }
745 }
746 
747 static uint32_t
anv_block_pool_alloc_new(struct anv_block_pool * pool,struct anv_block_state * pool_state,uint32_t block_size,uint32_t * padding)748 anv_block_pool_alloc_new(struct anv_block_pool *pool,
749                          struct anv_block_state *pool_state,
750                          uint32_t block_size, uint32_t *padding)
751 {
752    struct anv_block_state state, old, new;
753 
754    /* Most allocations won't generate any padding */
755    if (padding)
756       *padding = 0;
757 
758    while (1) {
759       state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
760       if (state.next + block_size <= state.end) {
761          return state.next;
762       } else if (state.next <= state.end) {
763          if (!pool->use_relocations && state.next < state.end) {
764             /* We need to grow the block pool, but still have some leftover
765              * space that can't be used by that particular allocation. So we
766              * add that as a "padding", and return it.
767              */
768             uint32_t leftover = state.end - state.next;
769 
770             /* If there is some leftover space in the pool, the caller must
771              * deal with it.
772              */
773             assert(leftover == 0 || padding);
774             if (padding)
775                *padding = leftover;
776             state.next += leftover;
777          }
778 
779          /* We allocated the first block outside the pool so we have to grow
780           * the pool.  pool_state->next acts a mutex: threads who try to
781           * allocate now will get block indexes above the current limit and
782           * hit futex_wait below.
783           */
784          new.next = state.next + block_size;
785          do {
786             new.end = anv_block_pool_grow(pool, pool_state, block_size);
787          } while (new.end < new.next);
788 
789          old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
790          if (old.next != state.next)
791             futex_wake(&pool_state->end, INT_MAX);
792          return state.next;
793       } else {
794          futex_wait(&pool_state->end, state.end, NULL);
795          continue;
796       }
797    }
798 }
799 
800 int32_t
anv_block_pool_alloc(struct anv_block_pool * pool,uint32_t block_size,uint32_t * padding)801 anv_block_pool_alloc(struct anv_block_pool *pool,
802                      uint32_t block_size, uint32_t *padding)
803 {
804    uint32_t offset;
805 
806    offset = anv_block_pool_alloc_new(pool, &pool->state, block_size, padding);
807 
808    return offset;
809 }
810 
811 /* Allocates a block out of the back of the block pool.
812  *
813  * This will allocated a block earlier than the "start" of the block pool.
814  * The offsets returned from this function will be negative but will still
815  * be correct relative to the block pool's map pointer.
816  *
817  * If you ever use anv_block_pool_alloc_back, then you will have to do
818  * gymnastics with the block pool's BO when doing relocations.
819  */
820 int32_t
anv_block_pool_alloc_back(struct anv_block_pool * pool,uint32_t block_size)821 anv_block_pool_alloc_back(struct anv_block_pool *pool,
822                           uint32_t block_size)
823 {
824    int32_t offset = anv_block_pool_alloc_new(pool, &pool->back_state,
825                                              block_size, NULL);
826 
827    /* The offset we get out of anv_block_pool_alloc_new() is actually the
828     * number of bytes downwards from the middle to the end of the block.
829     * We need to turn it into a (negative) offset from the middle to the
830     * start of the block.
831     */
832    assert(offset >= 0);
833    return -(offset + block_size);
834 }
835 
836 VkResult
anv_state_pool_init(struct anv_state_pool * pool,struct anv_device * device,const char * name,uint64_t base_address,int32_t start_offset,uint32_t block_size)837 anv_state_pool_init(struct anv_state_pool *pool,
838                     struct anv_device *device,
839                     const char *name,
840                     uint64_t base_address,
841                     int32_t start_offset,
842                     uint32_t block_size)
843 {
844    /* We don't want to ever see signed overflow */
845    assert(start_offset < INT32_MAX - (int32_t)BLOCK_POOL_MEMFD_SIZE);
846 
847    uint32_t initial_size = block_size * 16;
848    if (device->info.verx10 >= 125)
849       initial_size = MAX2(initial_size, 2 * 1024 * 1024);
850 
851    VkResult result = anv_block_pool_init(&pool->block_pool, device, name,
852                                          base_address + start_offset,
853                                          initial_size);
854    if (result != VK_SUCCESS)
855       return result;
856 
857    pool->start_offset = start_offset;
858 
859    result = anv_state_table_init(&pool->table, device, 64);
860    if (result != VK_SUCCESS) {
861       anv_block_pool_finish(&pool->block_pool);
862       return result;
863    }
864 
865    assert(util_is_power_of_two_or_zero(block_size));
866    pool->block_size = block_size;
867    pool->back_alloc_free_list = ANV_FREE_LIST_EMPTY;
868    for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
869       pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
870       pool->buckets[i].block.next = 0;
871       pool->buckets[i].block.end = 0;
872    }
873    VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
874 
875    return VK_SUCCESS;
876 }
877 
878 void
anv_state_pool_finish(struct anv_state_pool * pool)879 anv_state_pool_finish(struct anv_state_pool *pool)
880 {
881    VG(VALGRIND_DESTROY_MEMPOOL(pool));
882    anv_state_table_finish(&pool->table);
883    anv_block_pool_finish(&pool->block_pool);
884 }
885 
886 static uint32_t
anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool * pool,struct anv_block_pool * block_pool,uint32_t state_size,uint32_t block_size,uint32_t * padding)887 anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
888                                     struct anv_block_pool *block_pool,
889                                     uint32_t state_size,
890                                     uint32_t block_size,
891                                     uint32_t *padding)
892 {
893    struct anv_block_state block, old, new;
894    uint32_t offset;
895 
896    /* We don't always use anv_block_pool_alloc(), which would set *padding to
897     * zero for us. So if we have a pointer to padding, we must zero it out
898     * ourselves here, to make sure we always return some sensible value.
899     */
900    if (padding)
901       *padding = 0;
902 
903    /* If our state is large, we don't need any sub-allocation from a block.
904     * Instead, we just grab whole (potentially large) blocks.
905     */
906    if (state_size >= block_size)
907       return anv_block_pool_alloc(block_pool, state_size, padding);
908 
909  restart:
910    block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);
911 
912    if (block.next < block.end) {
913       return block.next;
914    } else if (block.next == block.end) {
915       offset = anv_block_pool_alloc(block_pool, block_size, padding);
916       new.next = offset + state_size;
917       new.end = offset + block_size;
918       old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
919       if (old.next != block.next)
920          futex_wake(&pool->block.end, INT_MAX);
921       return offset;
922    } else {
923       futex_wait(&pool->block.end, block.end, NULL);
924       goto restart;
925    }
926 }
927 
928 static uint32_t
anv_state_pool_get_bucket(uint32_t size)929 anv_state_pool_get_bucket(uint32_t size)
930 {
931    unsigned size_log2 = ilog2_round_up(size);
932    assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
933    if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
934       size_log2 = ANV_MIN_STATE_SIZE_LOG2;
935    return size_log2 - ANV_MIN_STATE_SIZE_LOG2;
936 }
937 
938 static uint32_t
anv_state_pool_get_bucket_size(uint32_t bucket)939 anv_state_pool_get_bucket_size(uint32_t bucket)
940 {
941    uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2;
942    return 1 << size_log2;
943 }
944 
945 /** Helper to push a chunk into the state table.
946  *
947  * It creates 'count' entries into the state table and update their sizes,
948  * offsets and maps, also pushing them as "free" states.
949  */
950 static void
anv_state_pool_return_blocks(struct anv_state_pool * pool,uint32_t chunk_offset,uint32_t count,uint32_t block_size)951 anv_state_pool_return_blocks(struct anv_state_pool *pool,
952                              uint32_t chunk_offset, uint32_t count,
953                              uint32_t block_size)
954 {
955    /* Disallow returning 0 chunks */
956    assert(count != 0);
957 
958    /* Make sure we always return chunks aligned to the block_size */
959    assert(chunk_offset % block_size == 0);
960 
961    uint32_t st_idx;
962    UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count);
963    assert(result == VK_SUCCESS);
964    for (int i = 0; i < count; i++) {
965       /* update states that were added back to the state table */
966       struct anv_state *state_i = anv_state_table_get(&pool->table,
967                                                       st_idx + i);
968       state_i->alloc_size = block_size;
969       state_i->offset = pool->start_offset + chunk_offset + block_size * i;
970       state_i->map = anv_block_pool_map(&pool->block_pool,
971                                         state_i->offset,
972                                         state_i->alloc_size);
973    }
974 
975    uint32_t block_bucket = anv_state_pool_get_bucket(block_size);
976    anv_free_list_push(&pool->buckets[block_bucket].free_list,
977                       &pool->table, st_idx, count);
978 }
979 
980 /** Returns a chunk of memory back to the state pool.
981  *
982  * Do a two-level split. If chunk_size is bigger than divisor
983  * (pool->block_size), we return as many divisor sized blocks as we can, from
984  * the end of the chunk.
985  *
986  * The remaining is then split into smaller blocks (starting at small_size if
987  * it is non-zero), with larger blocks always being taken from the end of the
988  * chunk.
989  */
990 static void
anv_state_pool_return_chunk(struct anv_state_pool * pool,uint32_t chunk_offset,uint32_t chunk_size,uint32_t small_size)991 anv_state_pool_return_chunk(struct anv_state_pool *pool,
992                             uint32_t chunk_offset, uint32_t chunk_size,
993                             uint32_t small_size)
994 {
995    uint32_t divisor = pool->block_size;
996    uint32_t nblocks = chunk_size / divisor;
997    uint32_t rest = chunk_size - nblocks * divisor;
998 
999    if (nblocks > 0) {
1000       /* First return divisor aligned and sized chunks. We start returning
1001        * larger blocks from the end of the chunk, since they should already be
1002        * aligned to divisor. Also anv_state_pool_return_blocks() only accepts
1003        * aligned chunks.
1004        */
1005       uint32_t offset = chunk_offset + rest;
1006       anv_state_pool_return_blocks(pool, offset, nblocks, divisor);
1007    }
1008 
1009    chunk_size = rest;
1010    divisor /= 2;
1011 
1012    if (small_size > 0 && small_size < divisor)
1013       divisor = small_size;
1014 
1015    uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2;
1016 
1017    /* Just as before, return larger divisor aligned blocks from the end of the
1018     * chunk first.
1019     */
1020    while (chunk_size > 0 && divisor >= min_size) {
1021       nblocks = chunk_size / divisor;
1022       rest = chunk_size - nblocks * divisor;
1023       if (nblocks > 0) {
1024          anv_state_pool_return_blocks(pool, chunk_offset + rest,
1025                                       nblocks, divisor);
1026          chunk_size = rest;
1027       }
1028       divisor /= 2;
1029    }
1030 }
1031 
1032 static struct anv_state
anv_state_pool_alloc_no_vg(struct anv_state_pool * pool,uint32_t size,uint32_t align)1033 anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
1034                            uint32_t size, uint32_t align)
1035 {
1036    uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align));
1037 
1038    struct anv_state *state;
1039    uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
1040    int32_t offset;
1041 
1042    /* Try free list first. */
1043    state = anv_free_list_pop(&pool->buckets[bucket].free_list,
1044                              &pool->table);
1045    if (state) {
1046       assert(state->offset >= pool->start_offset);
1047       goto done;
1048    }
1049 
1050    /* Try to grab a chunk from some larger bucket and split it up */
1051    for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) {
1052       state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table);
1053       if (state) {
1054          unsigned chunk_size = anv_state_pool_get_bucket_size(b);
1055          int32_t chunk_offset = state->offset;
1056 
1057          /* First lets update the state we got to its new size. offset and map
1058           * remain the same.
1059           */
1060          state->alloc_size = alloc_size;
1061 
1062          /* Now return the unused part of the chunk back to the pool as free
1063           * blocks
1064           *
1065           * There are a couple of options as to what we do with it:
1066           *
1067           *    1) We could fully split the chunk into state.alloc_size sized
1068           *       pieces.  However, this would mean that allocating a 16B
1069           *       state could potentially split a 2MB chunk into 512K smaller
1070           *       chunks.  This would lead to unnecessary fragmentation.
1071           *
1072           *    2) The classic "buddy allocator" method would have us split the
1073           *       chunk in half and return one half.  Then we would split the
1074           *       remaining half in half and return one half, and repeat as
1075           *       needed until we get down to the size we want.  However, if
1076           *       you are allocating a bunch of the same size state (which is
1077           *       the common case), this means that every other allocation has
1078           *       to go up a level and every fourth goes up two levels, etc.
1079           *       This is not nearly as efficient as it could be if we did a
1080           *       little more work up-front.
1081           *
1082           *    3) Split the difference between (1) and (2) by doing a
1083           *       two-level split.  If it's bigger than some fixed block_size,
1084           *       we split it into block_size sized chunks and return all but
1085           *       one of them.  Then we split what remains into
1086           *       state.alloc_size sized chunks and return them.
1087           *
1088           * We choose something close to option (3), which is implemented with
1089           * anv_state_pool_return_chunk(). That is done by returning the
1090           * remaining of the chunk, with alloc_size as a hint of the size that
1091           * we want the smaller chunk split into.
1092           */
1093          anv_state_pool_return_chunk(pool, chunk_offset + alloc_size,
1094                                      chunk_size - alloc_size, alloc_size);
1095          goto done;
1096       }
1097    }
1098 
1099    uint32_t padding;
1100    offset = anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
1101                                                 &pool->block_pool,
1102                                                 alloc_size,
1103                                                 pool->block_size,
1104                                                 &padding);
1105    /* Every time we allocate a new state, add it to the state pool */
1106    uint32_t idx;
1107    UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
1108    assert(result == VK_SUCCESS);
1109 
1110    state = anv_state_table_get(&pool->table, idx);
1111    state->offset = pool->start_offset + offset;
1112    state->alloc_size = alloc_size;
1113    state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);
1114 
1115    if (padding > 0) {
1116       uint32_t return_offset = offset - padding;
1117       anv_state_pool_return_chunk(pool, return_offset, padding, 0);
1118    }
1119 
1120 done:
1121    return *state;
1122 }
1123 
1124 struct anv_state
anv_state_pool_alloc(struct anv_state_pool * pool,uint32_t size,uint32_t align)1125 anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
1126 {
1127    if (size == 0)
1128       return ANV_STATE_NULL;
1129 
1130    struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align);
1131    VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
1132    return state;
1133 }
1134 
1135 struct anv_state
anv_state_pool_alloc_back(struct anv_state_pool * pool)1136 anv_state_pool_alloc_back(struct anv_state_pool *pool)
1137 {
1138    struct anv_state *state;
1139    uint32_t alloc_size = pool->block_size;
1140 
1141    /* This function is only used with pools where start_offset == 0 */
1142    assert(pool->start_offset == 0);
1143 
1144    state = anv_free_list_pop(&pool->back_alloc_free_list, &pool->table);
1145    if (state) {
1146       assert(state->offset < pool->start_offset);
1147       goto done;
1148    }
1149 
1150    int32_t offset;
1151    offset = anv_block_pool_alloc_back(&pool->block_pool,
1152                                       pool->block_size);
1153    uint32_t idx;
1154    UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
1155    assert(result == VK_SUCCESS);
1156 
1157    state = anv_state_table_get(&pool->table, idx);
1158    state->offset = pool->start_offset + offset;
1159    state->alloc_size = alloc_size;
1160    state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);
1161 
1162 done:
1163    VG(VALGRIND_MEMPOOL_ALLOC(pool, state->map, state->alloc_size));
1164    return *state;
1165 }
1166 
1167 static void
anv_state_pool_free_no_vg(struct anv_state_pool * pool,struct anv_state state)1168 anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
1169 {
1170    assert(util_is_power_of_two_or_zero(state.alloc_size));
1171    unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);
1172 
1173    if (state.offset < pool->start_offset) {
1174       assert(state.alloc_size == pool->block_size);
1175       anv_free_list_push(&pool->back_alloc_free_list,
1176                          &pool->table, state.idx, 1);
1177    } else {
1178       anv_free_list_push(&pool->buckets[bucket].free_list,
1179                          &pool->table, state.idx, 1);
1180    }
1181 }
1182 
1183 void
anv_state_pool_free(struct anv_state_pool * pool,struct anv_state state)1184 anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
1185 {
1186    if (state.alloc_size == 0)
1187       return;
1188 
1189    VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
1190    anv_state_pool_free_no_vg(pool, state);
1191 }
1192 
1193 struct anv_state_stream_block {
1194    struct anv_state block;
1195 
1196    /* The next block */
1197    struct anv_state_stream_block *next;
1198 
1199 #ifdef HAVE_VALGRIND
1200    /* A pointer to the first user-allocated thing in this block.  This is
1201     * what valgrind sees as the start of the block.
1202     */
1203    void *_vg_ptr;
1204 #endif
1205 };
1206 
1207 /* The state stream allocator is a one-shot, single threaded allocator for
1208  * variable sized blocks.  We use it for allocating dynamic state.
1209  */
1210 void
anv_state_stream_init(struct anv_state_stream * stream,struct anv_state_pool * state_pool,uint32_t block_size)1211 anv_state_stream_init(struct anv_state_stream *stream,
1212                       struct anv_state_pool *state_pool,
1213                       uint32_t block_size)
1214 {
1215    stream->state_pool = state_pool;
1216    stream->block_size = block_size;
1217 
1218    stream->block = ANV_STATE_NULL;
1219 
1220    /* Ensure that next + whatever > block_size.  This way the first call to
1221     * state_stream_alloc fetches a new block.
1222     */
1223    stream->next = block_size;
1224 
1225    util_dynarray_init(&stream->all_blocks, NULL);
1226 
1227    VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
1228 }
1229 
1230 void
anv_state_stream_finish(struct anv_state_stream * stream)1231 anv_state_stream_finish(struct anv_state_stream *stream)
1232 {
1233    util_dynarray_foreach(&stream->all_blocks, struct anv_state, block) {
1234       VG(VALGRIND_MEMPOOL_FREE(stream, block->map));
1235       VG(VALGRIND_MAKE_MEM_NOACCESS(block->map, block->alloc_size));
1236       anv_state_pool_free_no_vg(stream->state_pool, *block);
1237    }
1238    util_dynarray_fini(&stream->all_blocks);
1239 
1240    VG(VALGRIND_DESTROY_MEMPOOL(stream));
1241 }
1242 
1243 struct anv_state
anv_state_stream_alloc(struct anv_state_stream * stream,uint32_t size,uint32_t alignment)1244 anv_state_stream_alloc(struct anv_state_stream *stream,
1245                        uint32_t size, uint32_t alignment)
1246 {
1247    if (size == 0)
1248       return ANV_STATE_NULL;
1249 
1250    assert(alignment <= PAGE_SIZE);
1251 
1252    uint32_t offset = align_u32(stream->next, alignment);
1253    if (offset + size > stream->block.alloc_size) {
1254       uint32_t block_size = stream->block_size;
1255       if (block_size < size)
1256          block_size = round_to_power_of_two(size);
1257 
1258       stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
1259                                                  block_size, PAGE_SIZE);
1260       util_dynarray_append(&stream->all_blocks,
1261                            struct anv_state, stream->block);
1262       VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size));
1263 
1264       /* Reset back to the start */
1265       stream->next = offset = 0;
1266       assert(offset + size <= stream->block.alloc_size);
1267    }
1268    const bool new_block = stream->next == 0;
1269 
1270    struct anv_state state = stream->block;
1271    state.offset += offset;
1272    state.alloc_size = size;
1273    state.map += offset;
1274 
1275    stream->next = offset + size;
1276 
1277    if (new_block) {
1278       assert(state.map == stream->block.map);
1279       VG(VALGRIND_MEMPOOL_ALLOC(stream, state.map, size));
1280    } else {
1281       /* This only updates the mempool.  The newly allocated chunk is still
1282        * marked as NOACCESS. */
1283       VG(VALGRIND_MEMPOOL_CHANGE(stream, stream->block.map, stream->block.map,
1284                                  stream->next));
1285       /* Mark the newly allocated chunk as undefined */
1286       VG(VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size));
1287    }
1288 
1289    return state;
1290 }
1291 
1292 void
anv_state_reserved_pool_init(struct anv_state_reserved_pool * pool,struct anv_state_pool * parent,uint32_t count,uint32_t size,uint32_t alignment)1293 anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool,
1294                              struct anv_state_pool *parent,
1295                              uint32_t count, uint32_t size, uint32_t alignment)
1296 {
1297    pool->pool = parent;
1298    pool->reserved_blocks = ANV_FREE_LIST_EMPTY;
1299    pool->count = count;
1300 
1301    for (unsigned i = 0; i < count; i++) {
1302       struct anv_state state = anv_state_pool_alloc(pool->pool, size, alignment);
1303       anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1304    }
1305 }
1306 
1307 void
anv_state_reserved_pool_finish(struct anv_state_reserved_pool * pool)1308 anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool)
1309 {
1310    struct anv_state *state;
1311 
1312    while ((state = anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table))) {
1313       anv_state_pool_free(pool->pool, *state);
1314       pool->count--;
1315    }
1316    assert(pool->count == 0);
1317 }
1318 
1319 struct anv_state
anv_state_reserved_pool_alloc(struct anv_state_reserved_pool * pool)1320 anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool)
1321 {
1322    return *anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table);
1323 }
1324 
1325 void
anv_state_reserved_pool_free(struct anv_state_reserved_pool * pool,struct anv_state state)1326 anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
1327                              struct anv_state state)
1328 {
1329    anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1330 }
1331 
1332 void
anv_bo_pool_init(struct anv_bo_pool * pool,struct anv_device * device,const char * name)1333 anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
1334                  const char *name)
1335 {
1336    pool->name = name;
1337    pool->device = device;
1338    for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1339       util_sparse_array_free_list_init(&pool->free_list[i],
1340                                        &device->bo_cache.bo_map, 0,
1341                                        offsetof(struct anv_bo, free_index));
1342    }
1343 
1344    VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
1345 }
1346 
1347 void
anv_bo_pool_finish(struct anv_bo_pool * pool)1348 anv_bo_pool_finish(struct anv_bo_pool *pool)
1349 {
1350    for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1351       while (1) {
1352          struct anv_bo *bo =
1353             util_sparse_array_free_list_pop_elem(&pool->free_list[i]);
1354          if (bo == NULL)
1355             break;
1356 
1357          /* anv_device_release_bo is going to "free" it */
1358          VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1));
1359          anv_device_release_bo(pool->device, bo);
1360       }
1361    }
1362 
1363    VG(VALGRIND_DESTROY_MEMPOOL(pool));
1364 }
1365 
1366 VkResult
anv_bo_pool_alloc(struct anv_bo_pool * pool,uint32_t size,struct anv_bo ** bo_out)1367 anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
1368                   struct anv_bo **bo_out)
1369 {
1370    const unsigned size_log2 = size < 4096 ? 12 : ilog2_round_up(size);
1371    const unsigned pow2_size = 1 << size_log2;
1372    const unsigned bucket = size_log2 - 12;
1373    assert(bucket < ARRAY_SIZE(pool->free_list));
1374 
1375    struct anv_bo *bo =
1376       util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]);
1377    if (bo != NULL) {
1378       VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1379       *bo_out = bo;
1380       return VK_SUCCESS;
1381    }
1382 
1383    VkResult result = anv_device_alloc_bo(pool->device,
1384                                          pool->name,
1385                                          pow2_size,
1386                                          ANV_BO_ALLOC_LOCAL_MEM |
1387                                          ANV_BO_ALLOC_MAPPED |
1388                                          ANV_BO_ALLOC_SNOOPED |
1389                                          ANV_BO_ALLOC_CAPTURE,
1390                                          0 /* explicit_address */,
1391                                          &bo);
1392    if (result != VK_SUCCESS)
1393       return result;
1394 
1395    /* We want it to look like it came from this pool */
1396    VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
1397    VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1398 
1399    *bo_out = bo;
1400 
1401    return VK_SUCCESS;
1402 }
1403 
1404 void
anv_bo_pool_free(struct anv_bo_pool * pool,struct anv_bo * bo)1405 anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo)
1406 {
1407    VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));
1408 
1409    assert(util_is_power_of_two_or_zero(bo->size));
1410    const unsigned size_log2 = ilog2_round_up(bo->size);
1411    const unsigned bucket = size_log2 - 12;
1412    assert(bucket < ARRAY_SIZE(pool->free_list));
1413 
1414    assert(util_sparse_array_get(&pool->device->bo_cache.bo_map,
1415                                 bo->gem_handle) == bo);
1416    util_sparse_array_free_list_push(&pool->free_list[bucket],
1417                                     &bo->gem_handle, 1);
1418 }
1419 
1420 // Scratch pool
1421 
1422 void
anv_scratch_pool_init(struct anv_device * device,struct anv_scratch_pool * pool)1423 anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool)
1424 {
1425    memset(pool, 0, sizeof(*pool));
1426 }
1427 
1428 void
anv_scratch_pool_finish(struct anv_device * device,struct anv_scratch_pool * pool)1429 anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool)
1430 {
1431    for (unsigned s = 0; s < ARRAY_SIZE(pool->bos[0]); s++) {
1432       for (unsigned i = 0; i < 16; i++) {
1433          if (pool->bos[i][s] != NULL)
1434             anv_device_release_bo(device, pool->bos[i][s]);
1435       }
1436    }
1437 
1438    for (unsigned i = 0; i < 16; i++) {
1439       if (pool->surf_states[i].map != NULL) {
1440          anv_state_pool_free(&device->surface_state_pool,
1441                              pool->surf_states[i]);
1442       }
1443    }
1444 }
1445 
1446 struct anv_bo *
anv_scratch_pool_alloc(struct anv_device * device,struct anv_scratch_pool * pool,gl_shader_stage stage,unsigned per_thread_scratch)1447 anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
1448                        gl_shader_stage stage, unsigned per_thread_scratch)
1449 {
1450    if (per_thread_scratch == 0)
1451       return NULL;
1452 
1453    unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1454    assert(scratch_size_log2 < 16);
1455 
1456    assert(stage < ARRAY_SIZE(pool->bos));
1457 
1458    const struct intel_device_info *devinfo = &device->info;
1459 
1460    /* On GFX version 12.5, scratch access changed to a surface-based model.
1461     * Instead of each shader type having its own layout based on IDs passed
1462     * from the relevant fixed-function unit, all scratch access is based on
1463     * thread IDs like it always has been for compute.
1464     */
1465    if (devinfo->verx10 >= 125)
1466       stage = MESA_SHADER_COMPUTE;
1467 
1468    struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]);
1469 
1470    if (bo != NULL)
1471       return bo;
1472 
1473    assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
1474    uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage];
1475 
1476    /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they
1477     * are still relative to the general state base address.  When we emit
1478     * STATE_BASE_ADDRESS, we set general state base address to 0 and the size
1479     * to the maximum (1 page under 4GB).  This allows us to just place the
1480     * scratch buffers anywhere we wish in the bottom 32 bits of address space
1481     * and just set the scratch base pointer in 3DSTATE_*S using a relocation.
1482     * However, in order to do so, we need to ensure that the kernel does not
1483     * place the scratch BO above the 32-bit boundary.
1484     *
1485     * NOTE: Technically, it can't go "anywhere" because the top page is off
1486     * limits.  However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the
1487     * kernel allocates space using
1488     *
1489     *    end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE);
1490     *
1491     * so nothing will ever touch the top page.
1492     */
1493    VkResult result = anv_device_alloc_bo(device, "scratch", size,
1494                                          ANV_BO_ALLOC_32BIT_ADDRESS |
1495                                          ANV_BO_ALLOC_LOCAL_MEM,
1496                                          0 /* explicit_address */,
1497                                          &bo);
1498    if (result != VK_SUCCESS)
1499       return NULL; /* TODO */
1500 
1501    struct anv_bo *current_bo =
1502       p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo);
1503    if (current_bo) {
1504       anv_device_release_bo(device, bo);
1505       return current_bo;
1506    } else {
1507       return bo;
1508    }
1509 }
1510 
1511 uint32_t
anv_scratch_pool_get_surf(struct anv_device * device,struct anv_scratch_pool * pool,unsigned per_thread_scratch)1512 anv_scratch_pool_get_surf(struct anv_device *device,
1513                           struct anv_scratch_pool *pool,
1514                           unsigned per_thread_scratch)
1515 {
1516    if (per_thread_scratch == 0)
1517       return 0;
1518 
1519    unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1520    assert(scratch_size_log2 < 16);
1521 
1522    uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]);
1523    if (surf > 0)
1524       return surf;
1525 
1526    struct anv_bo *bo =
1527       anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE,
1528                              per_thread_scratch);
1529    struct anv_address addr = { .bo = bo };
1530 
1531    struct anv_state state =
1532       anv_state_pool_alloc(&device->surface_state_pool,
1533                            device->isl_dev.ss.size, 64);
1534 
1535    isl_buffer_fill_state(&device->isl_dev, state.map,
1536                          .address = anv_address_physical(addr),
1537                          .size_B = bo->size,
1538                          .mocs = anv_mocs(device, bo, 0),
1539                          .format = ISL_FORMAT_RAW,
1540                          .swizzle = ISL_SWIZZLE_IDENTITY,
1541                          .stride_B = per_thread_scratch,
1542                          .is_scratch = true);
1543 
1544    uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
1545                                        0, state.offset);
1546    if (current) {
1547       anv_state_pool_free(&device->surface_state_pool, state);
1548       return current;
1549    } else {
1550       pool->surf_states[scratch_size_log2] = state;
1551       return state.offset;
1552    }
1553 }
1554 
1555 VkResult
anv_bo_cache_init(struct anv_bo_cache * cache,struct anv_device * device)1556 anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device)
1557 {
1558    util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024);
1559 
1560    if (pthread_mutex_init(&cache->mutex, NULL)) {
1561       util_sparse_array_finish(&cache->bo_map);
1562       return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
1563                        "pthread_mutex_init failed: %m");
1564    }
1565 
1566    return VK_SUCCESS;
1567 }
1568 
1569 void
anv_bo_cache_finish(struct anv_bo_cache * cache)1570 anv_bo_cache_finish(struct anv_bo_cache *cache)
1571 {
1572    util_sparse_array_finish(&cache->bo_map);
1573    pthread_mutex_destroy(&cache->mutex);
1574 }
1575 
1576 #define ANV_BO_CACHE_SUPPORTED_FLAGS \
1577    (EXEC_OBJECT_WRITE | \
1578     EXEC_OBJECT_ASYNC | \
1579     EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \
1580     EXEC_OBJECT_PINNED | \
1581     EXEC_OBJECT_CAPTURE)
1582 
1583 static uint32_t
anv_bo_alloc_flags_to_bo_flags(struct anv_device * device,enum anv_bo_alloc_flags alloc_flags)1584 anv_bo_alloc_flags_to_bo_flags(struct anv_device *device,
1585                                enum anv_bo_alloc_flags alloc_flags)
1586 {
1587    struct anv_physical_device *pdevice = device->physical;
1588 
1589    uint64_t bo_flags = 0;
1590    if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS) &&
1591        pdevice->supports_48bit_addresses)
1592       bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
1593 
1594    if ((alloc_flags & ANV_BO_ALLOC_CAPTURE) && pdevice->has_exec_capture)
1595       bo_flags |= EXEC_OBJECT_CAPTURE;
1596 
1597    if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) {
1598       assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC);
1599       bo_flags |= EXEC_OBJECT_WRITE;
1600    }
1601 
1602    if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async)
1603       bo_flags |= EXEC_OBJECT_ASYNC;
1604 
1605    if (pdevice->use_softpin)
1606       bo_flags |= EXEC_OBJECT_PINNED;
1607 
1608    return bo_flags;
1609 }
1610 
1611 static void
anv_bo_finish(struct anv_device * device,struct anv_bo * bo)1612 anv_bo_finish(struct anv_device *device, struct anv_bo *bo)
1613 {
1614    if (bo->offset != 0 && anv_bo_is_pinned(bo) && !bo->has_fixed_address)
1615       anv_vma_free(device, bo->offset, bo->size + bo->_ccs_size);
1616 
1617    if (bo->map && !bo->from_host_ptr)
1618       anv_device_unmap_bo(device, bo, bo->map, bo->size);
1619 
1620    assert(bo->gem_handle != 0);
1621    anv_gem_close(device, bo->gem_handle);
1622 }
1623 
1624 static VkResult
anv_bo_vma_alloc_or_close(struct anv_device * device,struct anv_bo * bo,enum anv_bo_alloc_flags alloc_flags,uint64_t explicit_address)1625 anv_bo_vma_alloc_or_close(struct anv_device *device,
1626                           struct anv_bo *bo,
1627                           enum anv_bo_alloc_flags alloc_flags,
1628                           uint64_t explicit_address)
1629 {
1630    assert(anv_bo_is_pinned(bo));
1631    assert(explicit_address == intel_48b_address(explicit_address));
1632 
1633    uint32_t align = 4096;
1634 
1635    /* Gen12 CCS surface addresses need to be 64K aligned. */
1636    if (device->info.ver >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS))
1637       align = 64 * 1024;
1638 
1639    /* For XeHP, lmem and smem cannot share a single PDE, which means they
1640     * can't live in the same 2MiB aligned region.
1641     */
1642    if (device->info.verx10 >= 125)
1643        align = 2 * 1024 * 1024;
1644 
1645    if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
1646       bo->has_fixed_address = true;
1647       bo->offset = explicit_address;
1648    } else {
1649       bo->offset = anv_vma_alloc(device, bo->size + bo->_ccs_size,
1650                                  align, alloc_flags, explicit_address);
1651       if (bo->offset == 0) {
1652          anv_bo_finish(device, bo);
1653          return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
1654                           "failed to allocate virtual address for BO");
1655       }
1656    }
1657 
1658    return VK_SUCCESS;
1659 }
1660 
1661 VkResult
anv_device_alloc_bo(struct anv_device * device,const char * name,uint64_t size,enum anv_bo_alloc_flags alloc_flags,uint64_t explicit_address,struct anv_bo ** bo_out)1662 anv_device_alloc_bo(struct anv_device *device,
1663                     const char *name,
1664                     uint64_t size,
1665                     enum anv_bo_alloc_flags alloc_flags,
1666                     uint64_t explicit_address,
1667                     struct anv_bo **bo_out)
1668 {
1669    if (!(alloc_flags & ANV_BO_ALLOC_LOCAL_MEM))
1670       anv_perf_warn(VK_LOG_NO_OBJS(&device->physical->instance->vk.base),
1671                                    "system memory used");
1672 
1673    if (!device->physical->has_implicit_ccs)
1674       assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
1675 
1676    const uint32_t bo_flags =
1677       anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
1678    assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
1679 
1680    /* The kernel is going to give us whole pages anyway */
1681    size = align_u64(size, 4096);
1682 
1683    uint64_t ccs_size = 0;
1684    if (device->info.has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) {
1685       /* Align the size up to the next multiple of 64K so we don't have any
1686        * AUX-TT entries pointing from a 64K page to itself.
1687        */
1688       size = align_u64(size, 64 * 1024);
1689 
1690       /* See anv_bo::_ccs_size */
1691       ccs_size = align_u64(DIV_ROUND_UP(size, INTEL_AUX_MAP_GFX12_CCS_SCALE), 4096);
1692    }
1693 
1694    uint32_t gem_handle;
1695 
1696    /* If we have vram size, we have multiple memory regions and should choose
1697     * one of them.
1698     */
1699    if (anv_physical_device_has_vram(device->physical)) {
1700       struct drm_i915_gem_memory_class_instance regions[2];
1701       uint32_t nregions = 0;
1702 
1703       if (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM) {
1704          /* vram_non_mappable & vram_mappable actually are the same region. */
1705          regions[nregions++] = device->physical->vram_non_mappable.region;
1706       } else {
1707          regions[nregions++] = device->physical->sys.region;
1708       }
1709 
1710       uint32_t flags = 0;
1711       if (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE) {
1712          assert(alloc_flags & ANV_BO_ALLOC_LOCAL_MEM);
1713          /* We're required to add smem as a region when using mappable vram. */
1714          regions[nregions++] = device->physical->sys.region;
1715          flags |= I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS;
1716       }
1717 
1718       gem_handle = anv_gem_create_regions(device, size + ccs_size,
1719                                           flags, nregions, regions);
1720    } else {
1721       gem_handle = anv_gem_create(device, size + ccs_size);
1722    }
1723 
1724    if (gem_handle == 0)
1725       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1726 
1727    struct anv_bo new_bo = {
1728       .name = name,
1729       .gem_handle = gem_handle,
1730       .refcount = 1,
1731       .offset = -1,
1732       .size = size,
1733       ._ccs_size = ccs_size,
1734       .flags = bo_flags,
1735       .is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL),
1736       .has_client_visible_address =
1737          (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
1738       .has_implicit_ccs = ccs_size > 0 || (device->info.verx10 >= 125 &&
1739          (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM)),
1740    };
1741 
1742    if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
1743       VkResult result = anv_device_map_bo(device, &new_bo, 0, size,
1744                                           0 /* gem_flags */, &new_bo.map);
1745       if (unlikely(result != VK_SUCCESS)) {
1746          anv_gem_close(device, new_bo.gem_handle);
1747          return result;
1748       }
1749    }
1750 
1751    if (alloc_flags & ANV_BO_ALLOC_SNOOPED) {
1752       assert(alloc_flags & ANV_BO_ALLOC_MAPPED);
1753       /* We don't want to change these defaults if it's going to be shared
1754        * with another process.
1755        */
1756       assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL));
1757 
1758       /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
1759        * I915_CACHING_NONE on non-LLC platforms.  For many internal state
1760        * objects, we'd rather take the snooping overhead than risk forgetting
1761        * a CLFLUSH somewhere.  Userptr objects are always created as
1762        * I915_CACHING_CACHED, which on non-LLC means snooped so there's no
1763        * need to do this there.
1764        */
1765       if (!device->info.has_llc) {
1766          anv_gem_set_caching(device, new_bo.gem_handle,
1767                              I915_CACHING_CACHED);
1768       }
1769    }
1770 
1771    if (anv_bo_is_pinned(&new_bo)) {
1772       VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1773                                                   alloc_flags,
1774                                                   explicit_address);
1775       if (result != VK_SUCCESS)
1776          return result;
1777    } else {
1778       assert(!new_bo.has_client_visible_address);
1779    }
1780 
1781    if (new_bo._ccs_size > 0) {
1782       assert(device->info.has_aux_map);
1783       intel_aux_map_add_mapping(device->aux_map_ctx,
1784                                 intel_canonical_address(new_bo.offset),
1785                                 intel_canonical_address(new_bo.offset + new_bo.size),
1786                                 new_bo.size, 0 /* format_bits */);
1787    }
1788 
1789    assert(new_bo.gem_handle);
1790 
1791    /* If we just got this gem_handle from anv_bo_init_new then we know no one
1792     * else is touching this BO at the moment so we don't need to lock here.
1793     */
1794    struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle);
1795    *bo = new_bo;
1796 
1797    *bo_out = bo;
1798 
1799    return VK_SUCCESS;
1800 }
1801 
1802 VkResult
anv_device_map_bo(struct anv_device * device,struct anv_bo * bo,uint64_t offset,size_t size,uint32_t gem_flags,void ** map_out)1803 anv_device_map_bo(struct anv_device *device,
1804                   struct anv_bo *bo,
1805                   uint64_t offset,
1806                   size_t size,
1807                   uint32_t gem_flags,
1808                   void **map_out)
1809 {
1810    assert(!bo->is_wrapper && !bo->from_host_ptr);
1811    assert(size > 0);
1812 
1813    void *map = anv_gem_mmap(device, bo->gem_handle, offset, size, gem_flags);
1814    if (unlikely(map == MAP_FAILED))
1815       return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
1816 
1817    assert(map != NULL);
1818 
1819    if (map_out)
1820       *map_out = map;
1821 
1822    return VK_SUCCESS;
1823 }
1824 
1825 void
anv_device_unmap_bo(struct anv_device * device,struct anv_bo * bo,void * map,size_t map_size)1826 anv_device_unmap_bo(struct anv_device *device,
1827                     struct anv_bo *bo,
1828                     void *map, size_t map_size)
1829 {
1830    assert(!bo->is_wrapper && !bo->from_host_ptr);
1831 
1832    anv_gem_munmap(device, map, map_size);
1833 }
1834 
1835 VkResult
anv_device_import_bo_from_host_ptr(struct anv_device * device,void * host_ptr,uint32_t size,enum anv_bo_alloc_flags alloc_flags,uint64_t client_address,struct anv_bo ** bo_out)1836 anv_device_import_bo_from_host_ptr(struct anv_device *device,
1837                                    void *host_ptr, uint32_t size,
1838                                    enum anv_bo_alloc_flags alloc_flags,
1839                                    uint64_t client_address,
1840                                    struct anv_bo **bo_out)
1841 {
1842    assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
1843                            ANV_BO_ALLOC_SNOOPED |
1844                            ANV_BO_ALLOC_FIXED_ADDRESS)));
1845 
1846    assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS) ||
1847           (device->physical->has_implicit_ccs && device->info.has_aux_map));
1848 
1849    struct anv_bo_cache *cache = &device->bo_cache;
1850    const uint32_t bo_flags =
1851       anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
1852    assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
1853 
1854    uint32_t gem_handle = anv_gem_userptr(device, host_ptr, size);
1855    if (!gem_handle)
1856       return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1857 
1858    pthread_mutex_lock(&cache->mutex);
1859 
1860    struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
1861    if (bo->refcount > 0) {
1862       /* VK_EXT_external_memory_host doesn't require handling importing the
1863        * same pointer twice at the same time, but we don't get in the way.  If
1864        * kernel gives us the same gem_handle, only succeed if the flags match.
1865        */
1866       assert(bo->gem_handle == gem_handle);
1867       if (bo_flags != bo->flags) {
1868          pthread_mutex_unlock(&cache->mutex);
1869          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1870                           "same host pointer imported two different ways");
1871       }
1872 
1873       if (bo->has_client_visible_address !=
1874           ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) {
1875          pthread_mutex_unlock(&cache->mutex);
1876          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1877                           "The same BO was imported with and without buffer "
1878                           "device address");
1879       }
1880 
1881       if (client_address && client_address != intel_48b_address(bo->offset)) {
1882          pthread_mutex_unlock(&cache->mutex);
1883          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1884                           "The same BO was imported at two different "
1885                           "addresses");
1886       }
1887 
1888       __sync_fetch_and_add(&bo->refcount, 1);
1889    } else {
1890       struct anv_bo new_bo = {
1891          .name = "host-ptr",
1892          .gem_handle = gem_handle,
1893          .refcount = 1,
1894          .offset = -1,
1895          .size = size,
1896          .map = host_ptr,
1897          .flags = bo_flags,
1898          .is_external = true,
1899          .from_host_ptr = true,
1900          .has_client_visible_address =
1901             (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
1902       };
1903 
1904       if (anv_bo_is_pinned(&new_bo)) {
1905          VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1906                                                      alloc_flags,
1907                                                      client_address);
1908          if (result != VK_SUCCESS) {
1909             pthread_mutex_unlock(&cache->mutex);
1910             return result;
1911          }
1912       } else {
1913          assert(!new_bo.has_client_visible_address);
1914       }
1915 
1916       *bo = new_bo;
1917    }
1918 
1919    pthread_mutex_unlock(&cache->mutex);
1920    *bo_out = bo;
1921 
1922    return VK_SUCCESS;
1923 }
1924 
1925 VkResult
anv_device_import_bo(struct anv_device * device,int fd,enum anv_bo_alloc_flags alloc_flags,uint64_t client_address,struct anv_bo ** bo_out)1926 anv_device_import_bo(struct anv_device *device,
1927                      int fd,
1928                      enum anv_bo_alloc_flags alloc_flags,
1929                      uint64_t client_address,
1930                      struct anv_bo **bo_out)
1931 {
1932    assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
1933                            ANV_BO_ALLOC_SNOOPED |
1934                            ANV_BO_ALLOC_FIXED_ADDRESS)));
1935 
1936    assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS) ||
1937           (device->physical->has_implicit_ccs && device->info.has_aux_map));
1938 
1939    struct anv_bo_cache *cache = &device->bo_cache;
1940    const uint32_t bo_flags =
1941       anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
1942    assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
1943 
1944    pthread_mutex_lock(&cache->mutex);
1945 
1946    uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);
1947    if (!gem_handle) {
1948       pthread_mutex_unlock(&cache->mutex);
1949       return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1950    }
1951 
1952    struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
1953    if (bo->refcount > 0) {
1954       /* We have to be careful how we combine flags so that it makes sense.
1955        * Really, though, if we get to this case and it actually matters, the
1956        * client has imported a BO twice in different ways and they get what
1957        * they have coming.
1958        */
1959       uint64_t new_flags = 0;
1960       new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE;
1961       new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC;
1962       new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
1963       new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED;
1964       new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_CAPTURE;
1965 
1966       /* It's theoretically possible for a BO to get imported such that it's
1967        * both pinned and not pinned.  The only way this can happen is if it
1968        * gets imported as both a semaphore and a memory object and that would
1969        * be an application error.  Just fail out in that case.
1970        */
1971       if ((bo->flags & EXEC_OBJECT_PINNED) !=
1972           (bo_flags & EXEC_OBJECT_PINNED)) {
1973          pthread_mutex_unlock(&cache->mutex);
1974          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1975                           "The same BO was imported two different ways");
1976       }
1977 
1978       /* It's also theoretically possible that someone could export a BO from
1979        * one heap and import it into another or to import the same BO into two
1980        * different heaps.  If this happens, we could potentially end up both
1981        * allowing and disallowing 48-bit addresses.  There's not much we can
1982        * do about it if we're pinning so we just throw an error and hope no
1983        * app is actually that stupid.
1984        */
1985       if ((new_flags & EXEC_OBJECT_PINNED) &&
1986           (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) !=
1987           (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) {
1988          pthread_mutex_unlock(&cache->mutex);
1989          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1990                           "The same BO was imported on two different heaps");
1991       }
1992 
1993       if (bo->has_client_visible_address !=
1994           ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) {
1995          pthread_mutex_unlock(&cache->mutex);
1996          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1997                           "The same BO was imported with and without buffer "
1998                           "device address");
1999       }
2000 
2001       if (client_address && client_address != intel_48b_address(bo->offset)) {
2002          pthread_mutex_unlock(&cache->mutex);
2003          return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
2004                           "The same BO was imported at two different "
2005                           "addresses");
2006       }
2007 
2008       bo->flags = new_flags;
2009 
2010       __sync_fetch_and_add(&bo->refcount, 1);
2011    } else {
2012       off_t size = lseek(fd, 0, SEEK_END);
2013       if (size == (off_t)-1) {
2014          anv_gem_close(device, gem_handle);
2015          pthread_mutex_unlock(&cache->mutex);
2016          return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
2017       }
2018 
2019       struct anv_bo new_bo = {
2020          .name = "imported",
2021          .gem_handle = gem_handle,
2022          .refcount = 1,
2023          .offset = -1,
2024          .size = size,
2025          .flags = bo_flags,
2026          .is_external = true,
2027          .has_client_visible_address =
2028             (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
2029       };
2030 
2031       if (anv_bo_is_pinned(&new_bo)) {
2032          assert(new_bo._ccs_size == 0);
2033          VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
2034                                                      alloc_flags,
2035                                                      client_address);
2036          if (result != VK_SUCCESS) {
2037             pthread_mutex_unlock(&cache->mutex);
2038             return result;
2039          }
2040       } else {
2041          assert(!new_bo.has_client_visible_address);
2042       }
2043 
2044       *bo = new_bo;
2045    }
2046 
2047    pthread_mutex_unlock(&cache->mutex);
2048    *bo_out = bo;
2049 
2050    return VK_SUCCESS;
2051 }
2052 
2053 VkResult
anv_device_export_bo(struct anv_device * device,struct anv_bo * bo,int * fd_out)2054 anv_device_export_bo(struct anv_device *device,
2055                      struct anv_bo *bo, int *fd_out)
2056 {
2057    assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);
2058 
2059    /* This BO must have been flagged external in order for us to be able
2060     * to export it.  This is done based on external options passed into
2061     * anv_AllocateMemory.
2062     */
2063    assert(bo->is_external);
2064 
2065    int fd = anv_gem_handle_to_fd(device, bo->gem_handle);
2066    if (fd < 0)
2067       return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS);
2068 
2069    *fd_out = fd;
2070 
2071    return VK_SUCCESS;
2072 }
2073 
2074 VkResult
anv_device_get_bo_tiling(struct anv_device * device,struct anv_bo * bo,enum isl_tiling * tiling_out)2075 anv_device_get_bo_tiling(struct anv_device *device,
2076                          struct anv_bo *bo,
2077                          enum isl_tiling *tiling_out)
2078 {
2079    int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
2080    if (i915_tiling < 0) {
2081       return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
2082                        "failed to get BO tiling: %m");
2083    }
2084 
2085    *tiling_out = isl_tiling_from_i915_tiling(i915_tiling);
2086 
2087    return VK_SUCCESS;
2088 }
2089 
2090 VkResult
anv_device_set_bo_tiling(struct anv_device * device,struct anv_bo * bo,uint32_t row_pitch_B,enum isl_tiling tiling)2091 anv_device_set_bo_tiling(struct anv_device *device,
2092                          struct anv_bo *bo,
2093                          uint32_t row_pitch_B,
2094                          enum isl_tiling tiling)
2095 {
2096    int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B,
2097                                 isl_tiling_to_i915_tiling(tiling));
2098    if (ret) {
2099       return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
2100                        "failed to set BO tiling: %m");
2101    }
2102 
2103    return VK_SUCCESS;
2104 }
2105 
2106 static bool
atomic_dec_not_one(uint32_t * counter)2107 atomic_dec_not_one(uint32_t *counter)
2108 {
2109    uint32_t old, val;
2110 
2111    val = *counter;
2112    while (1) {
2113       if (val == 1)
2114          return false;
2115 
2116       old = __sync_val_compare_and_swap(counter, val, val - 1);
2117       if (old == val)
2118          return true;
2119 
2120       val = old;
2121    }
2122 }
2123 
2124 void
anv_device_release_bo(struct anv_device * device,struct anv_bo * bo)2125 anv_device_release_bo(struct anv_device *device,
2126                       struct anv_bo *bo)
2127 {
2128    struct anv_bo_cache *cache = &device->bo_cache;
2129    assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);
2130 
2131    /* Try to decrement the counter but don't go below one.  If this succeeds
2132     * then the refcount has been decremented and we are not the last
2133     * reference.
2134     */
2135    if (atomic_dec_not_one(&bo->refcount))
2136       return;
2137 
2138    pthread_mutex_lock(&cache->mutex);
2139 
2140    /* We are probably the last reference since our attempt to decrement above
2141     * failed.  However, we can't actually know until we are inside the mutex.
2142     * Otherwise, someone could import the BO between the decrement and our
2143     * taking the mutex.
2144     */
2145    if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) {
2146       /* Turns out we're not the last reference.  Unlock and bail. */
2147       pthread_mutex_unlock(&cache->mutex);
2148       return;
2149    }
2150    assert(bo->refcount == 0);
2151 
2152    if (bo->_ccs_size > 0) {
2153       assert(device->physical->has_implicit_ccs);
2154       assert(device->info.has_aux_map);
2155       assert(bo->has_implicit_ccs);
2156       intel_aux_map_unmap_range(device->aux_map_ctx,
2157                                 intel_canonical_address(bo->offset),
2158                                 bo->size);
2159    }
2160 
2161    /* Memset the BO just in case.  The refcount being zero should be enough to
2162     * prevent someone from assuming the data is valid but it's safer to just
2163     * stomp to zero just in case.  We explicitly do this *before* we actually
2164     * close the GEM handle to ensure that if anyone allocates something and
2165     * gets the same GEM handle, the memset has already happen and won't stomp
2166     * all over any data they may write in this BO.
2167     */
2168    struct anv_bo old_bo = *bo;
2169 
2170    memset(bo, 0, sizeof(*bo));
2171 
2172    anv_bo_finish(device, &old_bo);
2173 
2174    /* Don't unlock until we've actually closed the BO.  The whole point of
2175     * the BO cache is to ensure that we correctly handle races with creating
2176     * and releasing GEM handles and we don't want to let someone import the BO
2177     * again between mutex unlock and closing the GEM handle.
2178     */
2179    pthread_mutex_unlock(&cache->mutex);
2180 }
2181