1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <stdlib.h>
25 #include <unistd.h>
26 #include <limits.h>
27 #include <assert.h>
28 #include <sys/mman.h>
29
30 #include "anv_private.h"
31
32 #include "common/intel_aux_map.h"
33 #include "util/anon_file.h"
34 #include "util/futex.h"
35
36 #ifdef HAVE_VALGRIND
37 #define VG_NOACCESS_READ(__ptr) ({ \
38 VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \
39 __typeof(*(__ptr)) __val = *(__ptr); \
40 VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\
41 __val; \
42 })
43 #define VG_NOACCESS_WRITE(__ptr, __val) ({ \
44 VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr))); \
45 *(__ptr) = (__val); \
46 VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr))); \
47 })
48 #else
49 #define VG_NOACCESS_READ(__ptr) (*(__ptr))
50 #define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val))
51 #endif
52
53 #ifndef MAP_POPULATE
54 #define MAP_POPULATE 0
55 #endif
56
57 /* Design goals:
58 *
59 * - Lock free (except when resizing underlying bos)
60 *
61 * - Constant time allocation with typically only one atomic
62 *
63 * - Multiple allocation sizes without fragmentation
64 *
65 * - Can grow while keeping addresses and offsets of contents stable
66 *
67 * - All allocations within one bo so we can point one of the
68 * STATE_BASE_ADDRESS pointers at it.
69 *
70 * The overall design is a two-level allocator: top level is a fixed size, big
71 * block (8k) allocator, which operates out of a bo. Allocation is done by
72 * either pulling a block from the free list or growing the used range of the
73 * bo. Growing the range may run out of space in the bo which we then need to
74 * grow. Growing the bo is tricky in a multi-threaded, lockless environment:
75 * we need to keep all pointers and contents in the old map valid. GEM bos in
76 * general can't grow, but we use a trick: we create a memfd and use ftruncate
77 * to grow it as necessary. We mmap the new size and then create a gem bo for
78 * it using the new gem userptr ioctl. Without heavy-handed locking around
79 * our allocation fast-path, there isn't really a way to munmap the old mmap,
80 * so we just keep it around until garbage collection time. While the block
81 * allocator is lockless for normal operations, we block other threads trying
82 * to allocate while we're growing the map. It shouldn't happen often, and
83 * growing is fast anyway.
84 *
85 * At the next level we can use various sub-allocators. The state pool is a
86 * pool of smaller, fixed size objects, which operates much like the block
87 * pool. It uses a free list for freeing objects, but when it runs out of
88 * space it just allocates a new block from the block pool. This allocator is
89 * intended for longer lived state objects such as SURFACE_STATE and most
90 * other persistent state objects in the API. We may need to track more info
91 * with these objects and a pointer back to the CPU object (e.g. VkImage). In
92 * those cases we just allocate a slightly bigger object and put the extra
93 * state after the GPU state object.
94 *
95 * The state stream allocator works similarly to how the i965 DRI driver streams
96 * all its state. Even with Vulkan, we need to emit transient state (whether
97 * surface state base or dynamic state base), and for that we can just get a
98 * block and fill it up. These cases are local to a command buffer and the
99 * sub-allocator need not be thread safe. The streaming allocator gets a new
100 * block when it runs out of space and chains them together so they can be
101 * easily freed.
102 */
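/* Illustrative sketch only (not driver code): a rough view of how the three
 * levels described above are meant to be layered. Names mirror the APIs in
 * this file; error handling is omitted and the exact parameters are
 * assumptions for the example.
 *
 *    struct anv_state_pool pool;
 *    anv_state_pool_init(&pool, device, &params);      // block pool + table
 *
 *    // Long-lived, fixed-size state (e.g. SURFACE_STATE):
 *    struct anv_state surf = anv_state_pool_alloc(&pool, 64, 64);
 *
 *    // Transient, per-command-buffer state:
 *    struct anv_state_stream stream;
 *    anv_state_stream_init(&stream, &pool, 16 * 1024);
 *    struct anv_state tmp = anv_state_stream_alloc(&stream, 256, 32);
 *
 *    anv_state_stream_finish(&stream);                 // frees all blocks
 *    anv_state_pool_free(&pool, surf);
 *    anv_state_pool_finish(&pool);
 */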
103
104 /* The free list stores state table indices, and UINT32_MAX is never a
105 * valid index. We use it to indicate the free list is empty. */
106 #define EMPTY UINT32_MAX
107
108 /* On FreeBSD PAGE_SIZE is already defined in
109 * /usr/include/machine/param.h that is indirectly
110 * included here.
111 */
112 #ifndef PAGE_SIZE
113 #define PAGE_SIZE 4096
114 #endif
115
116 struct anv_state_table_cleanup {
117 void *map;
118 size_t size;
119 };
120
121 #define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0})
122 #define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry))
123
124 static VkResult
125 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size);
126
127 VkResult
128 anv_state_table_init(struct anv_state_table *table,
129 struct anv_device *device,
130 uint32_t initial_entries)
131 {
132 VkResult result;
133
134 table->device = device;
135
136 /* Just make it 2GB up-front. The Linux kernel won't actually back it
137 * with pages until we either map and fault on one of them or we use
138 * userptr and send a chunk of it off to the GPU.
139 */
140 table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table");
141 if (table->fd == -1)
142 return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
143
144 if (!u_vector_init(&table->cleanups, 8,
145 sizeof(struct anv_state_table_cleanup))) {
146 result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
147 goto fail_fd;
148 }
149
150 table->state.next = 0;
151 table->state.end = 0;
152 table->size = 0;
153
154 uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE;
155 result = anv_state_table_expand_range(table, initial_size);
156 if (result != VK_SUCCESS)
157 goto fail_cleanups;
158
159 return VK_SUCCESS;
160
161 fail_cleanups:
162 u_vector_finish(&table->cleanups);
163 fail_fd:
164 close(table->fd);
165
166 return result;
167 }
168
169 static VkResult
170 anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
171 {
172 void *map;
173 struct anv_state_table_cleanup *cleanup;
174
175 /* Assert that we only ever grow the pool */
176 assert(size >= table->state.end);
177
178 /* Make sure that we don't go outside the bounds of the memfd */
179 if (size > BLOCK_POOL_MEMFD_SIZE)
180 return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
181
182 cleanup = u_vector_add(&table->cleanups);
183 if (!cleanup)
184 return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
185
186 *cleanup = ANV_STATE_TABLE_CLEANUP_INIT;
187
188 /* Just leak the old map until we destroy the pool. We can't munmap it
189 * without races or imposing locking on the block allocate fast path. On
190 * the whole, the leaked maps add up to less than the size of the
191 * current map. MAP_POPULATE seems like the right thing to do, but we
192 * should try to get some numbers.
193 */
194 map = mmap(NULL, size, PROT_READ | PROT_WRITE,
195 MAP_SHARED | MAP_POPULATE, table->fd, 0);
196 if (map == MAP_FAILED) {
197 return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY,
198 "mmap failed: %m");
199 }
200
201 cleanup->map = map;
202 cleanup->size = size;
203
204 table->map = map;
205 table->size = size;
206
207 return VK_SUCCESS;
208 }
209
210 static VkResult
211 anv_state_table_grow(struct anv_state_table *table)
212 {
213 VkResult result = VK_SUCCESS;
214
215 uint32_t used = align(table->state.next * ANV_STATE_ENTRY_SIZE,
216 PAGE_SIZE);
217 uint32_t old_size = table->size;
218
219 /* The block pool is always initialized to a nonzero size and this function
220 * is always called after initialization.
221 */
222 assert(old_size > 0);
223
224 uint32_t required = MAX2(used, old_size);
225 if (used * 2 <= required) {
226 /* If we're in this case then this isn't the first allocation and we
227 * already have enough space on both sides to hold double what we
228 * have allocated. There's nothing for us to do.
229 */
230 goto done;
231 }
232
233 uint32_t size = old_size * 2;
234 while (size < required)
235 size *= 2;
236
237 assert(size > table->size);
238
239 result = anv_state_table_expand_range(table, size);
240
241 done:
242 return result;
243 }
244
245 void
246 anv_state_table_finish(struct anv_state_table *table)
247 {
248 struct anv_state_table_cleanup *cleanup;
249
250 u_vector_foreach(cleanup, &table->cleanups) {
251 if (cleanup->map)
252 munmap(cleanup->map, cleanup->size);
253 }
254
255 u_vector_finish(&table->cleanups);
256
257 close(table->fd);
258 }
259
260 VkResult
261 anv_state_table_add(struct anv_state_table *table, uint32_t *idx,
262 uint32_t count)
263 {
264 struct anv_block_state state, old, new;
265 VkResult result;
266
267 assert(idx);
268
269 while(1) {
270 state.u64 = __sync_fetch_and_add(&table->state.u64, count);
271 if (state.next + count <= state.end) {
272 assert(table->map);
273 struct anv_free_entry *entry = &table->map[state.next];
274 for (int i = 0; i < count; i++) {
275 entry[i].state.idx = state.next + i;
276 }
277 *idx = state.next;
278 return VK_SUCCESS;
279 } else if (state.next <= state.end) {
280 /* We allocated entries outside the table's current range so we have
281 * to grow it. table->state.next acts as a mutex: threads that try to
282 * allocate now will get indexes above the current limit and
283 * hit futex_wait below.
284 */
285 new.next = state.next + count;
286 do {
287 result = anv_state_table_grow(table);
288 if (result != VK_SUCCESS)
289 return result;
290 new.end = table->size / ANV_STATE_ENTRY_SIZE;
291 } while (new.end < new.next);
292
293 old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64);
294 if (old.next != state.next)
295 futex_wake(&table->state.end, INT_MAX);
296 } else {
297 futex_wait(&table->state.end, state.end, NULL);
298 continue;
299 }
300 }
301 }
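/* Illustrative sketch only: typical use of the table from a sub-allocator.
 * The index returned by anv_state_table_add() is what gets pushed onto the
 * free lists below; anv_state_table_get() (declared in anv_private.h) turns
 * it back into a struct anv_state pointer. Error handling is omitted.
 *
 *    uint32_t idx;
 *    anv_state_table_add(&pool->table, &idx, 1);
 *    struct anv_state *st = anv_state_table_get(&pool->table, idx);
 *    st->alloc_size = 64;
 */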
302
303 void
304 anv_free_list_push(union anv_free_list *list,
305 struct anv_state_table *table,
306 uint32_t first, uint32_t count)
307 {
308 union anv_free_list current, old, new;
309 uint32_t last = first;
310
311 for (uint32_t i = 1; i < count; i++, last++)
312 table->map[last].next = last + 1;
313
314 old.u64 = list->u64;
315 do {
316 current = old;
317 table->map[last].next = current.offset;
318 new.offset = first;
319 new.count = current.count + 1;
320 old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
321 } while (old.u64 != current.u64);
322 }
323
324 struct anv_state *
325 anv_free_list_pop(union anv_free_list *list,
326 struct anv_state_table *table)
327 {
328 union anv_free_list current, new, old;
329
330 current.u64 = list->u64;
331 while (current.offset != EMPTY) {
332 __sync_synchronize();
333 new.offset = table->map[current.offset].next;
334 new.count = current.count + 1;
335 old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
336 if (old.u64 == current.u64) {
337 struct anv_free_entry *entry = &table->map[current.offset];
338 return &entry->state;
339 }
340 current = old;
341 }
342
343 return NULL;
344 }
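/* Note on the free list CAS loops above: the 64-bit union packs the head
 * index together with a monotonically increasing count. Comparing and
 * swapping both halves as one quantity is what protects the pop against the
 * classic ABA problem, where the head entry is popped, reused and pushed
 * back between our read and our compare-and-swap. A sketch of the layout
 * this relies on (the real definition lives in anv_private.h):
 *
 *    union anv_free_list {
 *       struct {
 *          uint32_t offset;   // head index into the state table, or EMPTY
 *          uint32_t count;    // bumped on every push and pop
 *       };
 *       uint64_t u64;         // CAS'd as a single quantity
 *    };
 */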
345
346 static VkResult
347 anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size);
348
349 VkResult
350 anv_block_pool_init(struct anv_block_pool *pool,
351 struct anv_device *device,
352 const char *name,
353 uint64_t start_address,
354 uint32_t initial_size,
355 uint32_t max_size)
356 {
357 VkResult result;
358
359 /* Make sure VMA addresses are aligned for the block pool */
360 assert(anv_is_aligned(start_address, device->info->mem_alignment));
361 assert(anv_is_aligned(initial_size, device->info->mem_alignment));
362 assert(max_size > 0);
363 assert(max_size > initial_size);
364
365 pool->name = name;
366 pool->device = device;
367 pool->nbos = 0;
368 pool->size = 0;
369 pool->start_address = intel_canonical_address(start_address);
370 pool->max_size = max_size;
371
372 pool->bo = NULL;
373
374 pool->state.next = 0;
375 pool->state.end = 0;
376
377 pool->bo_alloc_flags =
378 ANV_BO_ALLOC_FIXED_ADDRESS |
379 ANV_BO_ALLOC_MAPPED |
380 ANV_BO_ALLOC_HOST_CACHED_COHERENT |
381 ANV_BO_ALLOC_CAPTURE |
382 ANV_BO_ALLOC_INTERNAL;
383
384 result = anv_block_pool_expand_range(pool, initial_size);
385 if (result != VK_SUCCESS)
386 return result;
387
388 /* Make the entire pool available in the front of the pool. If back
389 * allocation needs to use this space, the "ends" will be re-arranged.
390 */
391 pool->state.end = pool->size;
392
393 return VK_SUCCESS;
394 }
395
396 void
397 anv_block_pool_finish(struct anv_block_pool *pool)
398 {
399 anv_block_pool_foreach_bo(bo, pool) {
400 assert(bo->refcount == 1);
401 anv_device_release_bo(pool->device, bo);
402 }
403 }
404
405 static VkResult
406 anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size)
407 {
408 /* Assert that we only ever grow the pool */
409 assert(size >= pool->state.end);
410
411 /* For state pool BOs we have to be a bit careful about where we place them
412 * in the GTT. There are two documented workarounds for state base address
413 * placement : Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset
414 * which state that those two base addresses do not support 48-bit
415 * addresses and need to be placed in the bottom 32-bit range.
416 * Unfortunately, this is not quite accurate.
417 *
418 * The real problem is that we always set the size of our state pools in
419 * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most
420 * likely significantly smaller. We do this because we do not know at the
421 * time we emit STATE_BASE_ADDRESS whether or not we will need to expand
422 * the pool during command buffer building so we don't actually have a
423 * valid final size. If the address + size, as seen by STATE_BASE_ADDRESS
424 * overflows 48 bits, the GPU appears to treat all accesses to the buffer
425 * as being out of bounds and returns zero. For dynamic state, this
426 * usually just leads to rendering corruptions, but shaders that are all
427 * zero hang the GPU immediately.
428 *
429 * The easiest solution is to do exactly what the bogus workarounds say to
430 * do: restrict these buffers to 32-bit addresses. We could also pin the
431 * BO to some particular location of our choosing, but that's significantly
432 * more work than just not setting a flag. So, we explicitly DO NOT set
433 * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the
434 * hard work for us. When using softpin, we're in control and the fixed
435 * addresses we choose are fine for base addresses.
436 */
437
438 uint32_t new_bo_size = size - pool->size;
439 struct anv_bo *new_bo = NULL;
440 VkResult result = anv_device_alloc_bo(pool->device,
441 pool->name,
442 new_bo_size,
443 pool->bo_alloc_flags,
444 intel_48b_address(pool->start_address + pool->size),
445 &new_bo);
446 if (result != VK_SUCCESS)
447 return result;
448
449 pool->bos[pool->nbos++] = new_bo;
450
451 /* This pointer will always point to the first BO in the list */
452 pool->bo = pool->bos[0];
453
454 assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS);
455 pool->size = size;
456
457 return VK_SUCCESS;
458 }
459
460 /** Returns current memory map of the block pool.
461 *
462 * The returned pointer points to the map for the memory at the specified
463 * offset. The offset parameter is relative to the "center" of the block pool
464 * rather than the start of the block pool BO map.
465 */
466 void*
467 anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
468 {
469 struct anv_bo *bo = NULL;
470 int32_t bo_offset = 0;
471 anv_block_pool_foreach_bo(iter_bo, pool) {
472 if (offset < bo_offset + iter_bo->size) {
473 bo = iter_bo;
474 break;
475 }
476 bo_offset += iter_bo->size;
477 }
478 assert(bo != NULL);
479 assert(offset >= bo_offset);
480 assert((offset - bo_offset) + size <= bo->size);
481
482 return bo->map + (offset - bo_offset);
483 }
484
485 /** Grows and re-centers the block pool.
486 *
487 * We grow the block pool in one or both directions in such a way that the
488 * following conditions are met:
489 *
490 * 1) The size of the entire pool is always a power of two.
491 *
492 * 2) The pool only grows on both ends. Neither end can get
493 * shortened.
494 *
495 * 3) At the end of the allocation, we have about twice as much space
496 * allocated for each end as we have used. This way the pool doesn't
497 * grow too far in one direction or the other.
498 *
499 * 4) We have enough space allocated for at least one more block in
500 * whichever side `state` points to.
501 *
502 * 5) The center of the pool is always aligned to both the block_size of
503 * the pool and a 4K CPU page.
504 */
505 static uint32_t
506 anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
507 uint32_t contiguous_size)
508 {
509 VkResult result = VK_SUCCESS;
510
511 pthread_mutex_lock(&pool->device->mutex);
512
513 assert(state == &pool->state);
514
515 /* Gather a little usage information on the pool. Since we may have
516 * threads waiting in queue to get some storage while we resize, it's
517 * actually possible that total_used will be larger than old_size. In
518 * particular, block_pool_alloc() increments state->next prior to
519 * calling block_pool_grow, so this ensures that we get enough space for
520 * whichever side tries to grow the pool.
521 *
522 * We align to a page size because it makes it easier to do our
523 * calculations later in such a way that we stay page-aligned.
524 */
525 uint32_t total_used = align(pool->state.next, PAGE_SIZE);
526
527 uint32_t old_size = pool->size;
528
529 /* The block pool is always initialized to a nonzero size and this function
530 * is always called after initialization.
531 */
532 assert(old_size > 0);
533
534 /* total_used may be smaller than the actual requirement because it is
535 * based on the next pointers, which are updated prior to calling
536 * this function.
537 */
538 uint32_t required = MAX2(total_used, old_size);
539
540 /* With softpin, the pool is made up of a bunch of buffers with separate
541 * maps. Make sure we have enough contiguous space that we can get a
542 * properly contiguous map for the next chunk.
543 */
544 required = MAX2(required, old_size + contiguous_size);
545
546 if (required > pool->max_size) {
547 result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
548 } else if (total_used * 2 > required) {
549 uint32_t size = old_size * 2;
550 while (size < required)
551 size *= 2;
552
553 size = MIN2(size, pool->max_size);
554 assert(size > pool->size);
555
556 result = anv_block_pool_expand_range(pool, size);
557 }
558
559 pthread_mutex_unlock(&pool->device->mutex);
560
561 if (result != VK_SUCCESS)
562 return 0;
563
564 /* Return the appropriate new size. This function never actually
565 * updates state->next. Instead, we let the caller do that because it
566 * needs to do so in order to maintain its concurrency model.
567 */
568 return pool->size;
569 }
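/* Worked growth example (hedged; the exact values depend on the pool): with
 * old_size == 64KB, total_used == 40KB after page alignment and a
 * contiguous_size of 8KB, required becomes MAX2(MAX2(40KB, 64KB),
 * 64KB + 8KB) = 72KB. Since 40KB * 2 = 80KB > 72KB we double old_size to
 * 128KB, which already covers 72KB, and expand the pool to 128KB (clamped
 * to max_size).
 */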
570
571 static VkResult
572 anv_block_pool_alloc_new(struct anv_block_pool *pool,
573 struct anv_block_state *pool_state,
574 uint32_t block_size,
575 int64_t *offset,
576 uint32_t *padding)
577 {
578 struct anv_block_state state, old, new;
579
580 /* Most allocations won't generate any padding */
581 if (padding)
582 *padding = 0;
583
584 while (1) {
585 state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
586 if (state.next + block_size > pool->max_size) {
587 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
588 } else if (state.next + block_size <= state.end) {
589 *offset = state.next;
590 return VK_SUCCESS;
591 } else if (state.next <= state.end) {
592 if (state.next < state.end) {
593 /* We need to grow the block pool, but still have some leftover
594 * space that can't be used by that particular allocation. So we
595 * add that as a "padding", and return it.
596 */
597 uint32_t leftover = state.end - state.next;
598
599 /* If there is some leftover space in the pool, the caller must
600 * deal with it.
601 */
602 assert(leftover == 0 || padding);
603 if (padding)
604 *padding = leftover;
605 state.next += leftover;
606 }
607
608 /* We allocated the first block outside the pool so we have to grow
609 * the pool. pool_state->next acts as a mutex: threads that try to
610 * allocate now will get block indexes above the current limit and
611 * hit futex_wait below.
612 */
613 new.next = state.next + block_size;
614 do {
615 new.end = anv_block_pool_grow(pool, pool_state, block_size);
616 if (pool->size > 0 && new.end == 0) {
617 futex_wake(&pool_state->end, INT_MAX);
618 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
619 }
620 } while (new.end < new.next);
621
622 old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
623 if (old.next != state.next)
624 futex_wake(&pool_state->end, INT_MAX);
625 *offset = state.next;
626 return VK_SUCCESS;
627 } else {
628 futex_wait(&pool_state->end, state.end, NULL);
629 continue;
630 }
631 }
632 }
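/* Padding example for the path above (values are assumptions): if
 * state.next == 100KB and state.end == 104KB when an 8KB block is requested,
 * the 4KB of leftover space at 100KB cannot hold the block. It is reported
 * back through *padding (the state pool later returns it to a free list),
 * the pool is grown, and the 8KB block itself is placed at 104KB.
 */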
633
634 VkResult
635 anv_block_pool_alloc(struct anv_block_pool *pool,
636 uint32_t block_size,
637 int64_t *offset, uint32_t *padding)
638 {
639 return anv_block_pool_alloc_new(pool, &pool->state, block_size, offset, padding);
640 }
641
642 VkResult
643 anv_state_pool_init(struct anv_state_pool *pool,
644 struct anv_device *device,
645 const struct anv_state_pool_params *params)
646 {
647 uint32_t initial_size = MAX2(params->block_size * 16,
648 device->info->mem_alignment);
649
650 VkResult result = anv_block_pool_init(&pool->block_pool, device,
651 params->name,
652 params->base_address + params->start_offset,
653 initial_size,
654 params->max_size);
655 if (result != VK_SUCCESS)
656 return result;
657
658 pool->start_offset = params->start_offset;
659
660 result = anv_state_table_init(&pool->table, device, 64);
661 if (result != VK_SUCCESS) {
662 anv_block_pool_finish(&pool->block_pool);
663 return result;
664 }
665
666 assert(util_is_power_of_two_or_zero(params->block_size));
667 pool->block_size = params->block_size;
668 for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
669 pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
670 pool->buckets[i].block.next = 0;
671 pool->buckets[i].block.end = 0;
672 }
673 VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
674
675 return VK_SUCCESS;
676 }
677
678 void
679 anv_state_pool_finish(struct anv_state_pool *pool)
680 {
681 VG(VALGRIND_DESTROY_MEMPOOL(pool));
682 anv_state_table_finish(&pool->table);
683 anv_block_pool_finish(&pool->block_pool);
684 }
685
686 static VkResult
687 anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
688 struct anv_block_pool *block_pool,
689 uint32_t state_size,
690 uint32_t block_size,
691 int64_t *offset,
692 uint32_t *padding)
693 {
694 struct anv_block_state block, old, new;
695
696 /* We don't always use anv_block_pool_alloc(), which would set *padding to
697 * zero for us. So if we have a pointer to padding, we must zero it out
698 * ourselves here, to make sure we always return some sensible value.
699 */
700 if (padding)
701 *padding = 0;
702
703 /* If our state is large, we don't need any sub-allocation from a block.
704 * Instead, we just grab whole (potentially large) blocks.
705 */
706 if (state_size >= block_size)
707 return anv_block_pool_alloc(block_pool, state_size, offset, padding);
708
709 restart:
710 block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);
711
712 if (block.next < block.end) {
713 *offset = block.next;
714 return VK_SUCCESS;
715 } else if (block.next == block.end) {
716 VkResult result = anv_block_pool_alloc(block_pool, block_size,
717 offset, padding);
718 if (result != VK_SUCCESS)
719 return result;
720 new.next = *offset + state_size;
721 new.end = *offset + block_size;
722 old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
723 if (old.next != block.next)
724 futex_wake(&pool->block.end, INT_MAX);
725 return result;
726 } else {
727 futex_wait(&pool->block.end, block.end, NULL);
728 goto restart;
729 }
730 }
731
732 static uint32_t
733 anv_state_pool_get_bucket(uint32_t size)
734 {
735 unsigned size_log2 = util_logbase2_ceil(size);
736 assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
737 if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
738 size_log2 = ANV_MIN_STATE_SIZE_LOG2;
739 return size_log2 - ANV_MIN_STATE_SIZE_LOG2;
740 }
741
742 static uint32_t
743 anv_state_pool_get_bucket_size(uint32_t bucket)
744 {
745 uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2;
746 return 1 << size_log2;
747 }
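/* Worked example (assuming ANV_MIN_STATE_SIZE_LOG2 == 6, i.e. a 64 byte
 * minimum bucket): a 100 byte request rounds up to the next power of two,
 * 128, so util_logbase2_ceil(100) == 7 and the request lands in bucket
 * 7 - 6 = 1. anv_state_pool_get_bucket_size(1) then gives back
 * 1 << 7 == 128.
 */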
748
749 /** Helper to push a chunk into the state table.
750 *
751 * It creates 'count' entries in the state table, updates their sizes,
752 * offsets and maps, and pushes them onto the free list as "free" states.
753 */
754 static void
755 anv_state_pool_return_blocks(struct anv_state_pool *pool,
756 uint32_t chunk_offset, uint32_t count,
757 uint32_t block_size)
758 {
759 /* Disallow returning 0 chunks */
760 assert(count != 0);
761
762 /* Make sure we always return chunks aligned to the block_size */
763 assert(chunk_offset % block_size == 0);
764
765 uint32_t st_idx;
766 UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count);
767 assert(result == VK_SUCCESS);
768 for (int i = 0; i < count; i++) {
769 /* update states that were added back to the state table */
770 struct anv_state *state_i = anv_state_table_get(&pool->table,
771 st_idx + i);
772 state_i->alloc_size = block_size;
773 state_i->offset = pool->start_offset + chunk_offset + block_size * i;
774 state_i->map = anv_block_pool_map(&pool->block_pool,
775 state_i->offset,
776 state_i->alloc_size);
777 }
778
779 uint32_t block_bucket = anv_state_pool_get_bucket(block_size);
780 anv_free_list_push(&pool->buckets[block_bucket].free_list,
781 &pool->table, st_idx, count);
782 }
783
784 /** Returns a chunk of memory back to the state pool.
785 *
786 * Do a two-level split. If chunk_size is bigger than divisor
787 * (pool->block_size), we return as many divisor sized blocks as we can, from
788 * the end of the chunk.
789 *
790 * The remainder is then split into smaller blocks (starting at small_size if
791 * it is non-zero), with larger blocks always being taken from the end of the
792 * chunk.
793 */
794 static void
795 anv_state_pool_return_chunk(struct anv_state_pool *pool,
796 uint32_t chunk_offset, uint32_t chunk_size,
797 uint32_t small_size)
798 {
799 uint32_t divisor = pool->block_size;
800 uint32_t nblocks = chunk_size / divisor;
801 uint32_t rest = chunk_size - nblocks * divisor;
802
803 if (nblocks > 0) {
804 /* First return divisor aligned and sized chunks. We start returning
805 * larger blocks from the end of the chunk, since they should already be
806 * aligned to divisor. Also anv_state_pool_return_blocks() only accepts
807 * aligned chunks.
808 */
809 uint32_t offset = chunk_offset + rest;
810 anv_state_pool_return_blocks(pool, offset, nblocks, divisor);
811 }
812
813 chunk_size = rest;
814 divisor /= 2;
815
816 if (small_size > 0 && small_size < divisor)
817 divisor = small_size;
818
819 uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2;
820
821 /* Just as before, return larger divisor aligned blocks from the end of the
822 * chunk first.
823 */
824 while (chunk_size > 0 && divisor >= min_size) {
825 nblocks = chunk_size / divisor;
826 rest = chunk_size - nblocks * divisor;
827 if (nblocks > 0) {
828 anv_state_pool_return_blocks(pool, chunk_offset + rest,
829 nblocks, divisor);
830 chunk_size = rest;
831 }
832 divisor /= 2;
833 }
834 }
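/* Worked example of the split above, assuming pool->block_size == 4096 and
 * small_size == 0 (the values used when returning padding): a chunk of
 * 4096 + 1024 + 256 = 5376 bytes first gives back one 4096 byte block from
 * the end of the chunk. The remaining 1280 bytes are then split, again from
 * the end, into one 1024 byte block and one 256 byte block, each pushed onto
 * the free list of the matching bucket. With a non-zero small_size the
 * remainder is instead chopped into small_size sized blocks.
 */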
835
836 static struct anv_state
837 anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
838 uint32_t size, uint32_t align)
839 {
840 uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align));
841
842 struct anv_state *state;
843 uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
844 int64_t offset;
845
846 /* Try free list first. */
847 state = anv_free_list_pop(&pool->buckets[bucket].free_list,
848 &pool->table);
849 if (state) {
850 assert(state->offset >= pool->start_offset);
851 goto done;
852 }
853
854 /* Try to grab a chunk from some larger bucket and split it up */
855 for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) {
856 state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table);
857 if (state) {
858 unsigned chunk_size = anv_state_pool_get_bucket_size(b);
859 int32_t chunk_offset = state->offset;
860
861 /* First let's update the state we got to its new size. offset and map
862 * remain the same.
863 */
864 state->alloc_size = alloc_size;
865
866 /* Now return the unused part of the chunk back to the pool as free
867 * blocks
868 *
869 * There are a couple of options as to what we do with it:
870 *
871 * 1) We could fully split the chunk into state.alloc_size sized
872 * pieces. However, this would mean that allocating a 16B
873 * state could potentially split a 2MB chunk into 128K smaller
874 * chunks. This would lead to unnecessary fragmentation.
875 *
876 * 2) The classic "buddy allocator" method would have us split the
877 * chunk in half and return one half. Then we would split the
878 * remaining half in half and return one half, and repeat as
879 * needed until we get down to the size we want. However, if
880 * you are allocating a bunch of the same size state (which is
881 * the common case), this means that every other allocation has
882 * to go up a level and every fourth goes up two levels, etc.
883 * This is not nearly as efficient as it could be if we did a
884 * little more work up-front.
885 *
886 * 3) Split the difference between (1) and (2) by doing a
887 * two-level split. If it's bigger than some fixed block_size,
888 * we split it into block_size sized chunks and return all but
889 * one of them. Then we split what remains into
890 * state.alloc_size sized chunks and return them.
891 *
892 * We choose something close to option (3), which is implemented with
893 * anv_state_pool_return_chunk(). That is done by returning the
894 * remainder of the chunk, with alloc_size as a hint of the size that
895 * we want the smaller chunk split into.
896 */
897 anv_state_pool_return_chunk(pool, chunk_offset + alloc_size,
898 chunk_size - alloc_size, alloc_size);
899 goto done;
900 }
901 }
902
903 uint32_t padding;
904 VkResult result =
905 anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
906 &pool->block_pool,
907 alloc_size,
908 pool->block_size,
909 &offset,
910 &padding);
911 if (result != VK_SUCCESS)
912 return ANV_STATE_NULL;
913
914 /* Every time we allocate a new state, add it to the state table */
915 uint32_t idx = 0;
916 result = anv_state_table_add(&pool->table, &idx, 1);
917 assert(result == VK_SUCCESS);
918
919 state = anv_state_table_get(&pool->table, idx);
920 state->offset = pool->start_offset + offset;
921 state->alloc_size = alloc_size;
922 state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);
923
924 if (padding > 0) {
925 uint32_t return_offset = offset - padding;
926 anv_state_pool_return_chunk(pool, return_offset, padding, 0);
927 }
928
929 done:
930 return *state;
931 }
932
933 struct anv_state
934 anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
935 {
936 if (size == 0)
937 return ANV_STATE_NULL;
938
939 struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align);
940 VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
941 return state;
942 }
943
944 static void
945 anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
946 {
947 assert(util_is_power_of_two_or_zero(state.alloc_size));
948 unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);
949
950 assert(state.offset >= pool->start_offset);
951
952 anv_free_list_push(&pool->buckets[bucket].free_list,
953 &pool->table, state.idx, 1);
954 }
955
956 void
957 anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
958 {
959 if (state.alloc_size == 0)
960 return;
961
962 VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
963 anv_state_pool_free_no_vg(pool, state);
964 }
965
966 struct anv_state_stream_block {
967 struct anv_state block;
968
969 /* The next block */
970 struct anv_state_stream_block *next;
971
972 #ifdef HAVE_VALGRIND
973 /* A pointer to the first user-allocated thing in this block. This is
974 * what valgrind sees as the start of the block.
975 */
976 void *_vg_ptr;
977 #endif
978 };
979
980 /* The state stream allocator is a one-shot, single threaded allocator for
981 * variable sized blocks. We use it for allocating dynamic state.
982 */
983 void
984 anv_state_stream_init(struct anv_state_stream *stream,
985 struct anv_state_pool *state_pool,
986 uint32_t block_size)
987 {
988 stream->state_pool = state_pool;
989 stream->block_size = block_size;
990
991 stream->block = ANV_STATE_NULL;
992
993 /* Ensure that next + whatever > block_size. This way the first call to
994 * state_stream_alloc fetches a new block.
995 */
996 stream->next = block_size;
997
998 stream->total_size = 0;
999 util_dynarray_init(&stream->all_blocks, NULL);
1000
1001 VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
1002 }
1003
1004 void
1005 anv_state_stream_finish(struct anv_state_stream *stream)
1006 {
1007 util_dynarray_foreach(&stream->all_blocks, struct anv_state, block) {
1008 VG(VALGRIND_MEMPOOL_FREE(stream, block->map));
1009 VG(VALGRIND_MAKE_MEM_NOACCESS(block->map, block->alloc_size));
1010 anv_state_pool_free_no_vg(stream->state_pool, *block);
1011 }
1012 util_dynarray_fini(&stream->all_blocks);
1013
1014 VG(VALGRIND_DESTROY_MEMPOOL(stream));
1015 }
1016
1017 struct anv_state
1018 anv_state_stream_alloc(struct anv_state_stream *stream,
1019 uint32_t size, uint32_t alignment)
1020 {
1021 if (size == 0)
1022 return ANV_STATE_NULL;
1023
1024 assert(alignment <= PAGE_SIZE);
1025
1026 uint32_t offset = align(stream->next, alignment);
1027 if (offset + size > stream->block.alloc_size) {
1028 uint32_t block_size = stream->block_size;
1029 if (block_size < size)
1030 block_size = util_next_power_of_two(size);
1031
1032 stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
1033 block_size, PAGE_SIZE);
1034 util_dynarray_append(&stream->all_blocks,
1035 struct anv_state, stream->block);
1036 VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size));
1037
1038 /* Reset back to the start */
1039 stream->next = offset = 0;
1040 assert(offset + size <= stream->block.alloc_size);
1041 stream->total_size += block_size;
1042 }
1043 const bool new_block = stream->next == 0;
1044
1045 struct anv_state state = stream->block;
1046 state.offset += offset;
1047 state.alloc_size = size;
1048 state.map += offset;
1049
1050 stream->next = offset + size;
1051
1052 if (new_block) {
1053 assert(state.map == stream->block.map);
1054 VG(VALGRIND_MEMPOOL_ALLOC(stream, state.map, size));
1055 } else {
1056 /* This only updates the mempool. The newly allocated chunk is still
1057 * marked as NOACCESS. */
1058 VG(VALGRIND_MEMPOOL_CHANGE(stream, stream->block.map, stream->block.map,
1059 stream->next));
1060 /* Mark the newly allocated chunk as undefined */
1061 VG(VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size));
1062 }
1063
1064 return state;
1065 }
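/* Illustrative sketch only: requests larger than the stream's block_size are
 * handled by grabbing a dedicated power-of-two block, so a stream created
 * with a 16KB block size can still serve a 20KB request (it gets a 32KB
 * block) and later small requests continue from that block. state_pool and
 * the sizes here are assumptions for the example.
 *
 *    struct anv_state_stream stream;
 *    anv_state_stream_init(&stream, state_pool, 16 * 1024);
 *    struct anv_state big   = anv_state_stream_alloc(&stream, 20 * 1024, 64);
 *    struct anv_state small = anv_state_stream_alloc(&stream, 32, 32);
 *    anv_state_stream_finish(&stream);
 */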
1066
1067 void
1068 anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool,
1069 struct anv_state_pool *parent,
1070 uint32_t count, uint32_t size, uint32_t alignment)
1071 {
1072 pool->pool = parent;
1073 pool->reserved_blocks = ANV_FREE_LIST_EMPTY;
1074 pool->count = count;
1075
1076 for (unsigned i = 0; i < count; i++) {
1077 struct anv_state state = anv_state_pool_alloc(pool->pool, size, alignment);
1078 anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1079 }
1080 }
1081
1082 void
1083 anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool)
1084 {
1085 struct anv_state *state;
1086
1087 while ((state = anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table))) {
1088 anv_state_pool_free(pool->pool, *state);
1089 pool->count--;
1090 }
1091 assert(pool->count == 0);
1092 }
1093
1094 struct anv_state
1095 anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool)
1096 {
1097 return *anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table);
1098 }
1099
1100 void
1101 anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
1102 struct anv_state state)
1103 {
1104 anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
1105 }
1106
1107 void
1108 anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
1109 const char *name, enum anv_bo_alloc_flags alloc_flags)
1110 {
1111 pool->name = name;
1112 pool->device = device;
1113 pool->bo_alloc_flags = alloc_flags;
1114
1115 for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1116 util_sparse_array_free_list_init(&pool->free_list[i],
1117 &device->bo_cache.bo_map, 0,
1118 offsetof(struct anv_bo, free_index));
1119 }
1120
1121 VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
1122 }
1123
1124 void
1125 anv_bo_pool_finish(struct anv_bo_pool *pool)
1126 {
1127 for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
1128 while (1) {
1129 struct anv_bo *bo =
1130 util_sparse_array_free_list_pop_elem(&pool->free_list[i]);
1131 if (bo == NULL)
1132 break;
1133
1134 /* anv_device_release_bo is going to "free" it */
1135 VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1));
1136 anv_device_release_bo(pool->device, bo);
1137 }
1138 }
1139
1140 VG(VALGRIND_DESTROY_MEMPOOL(pool));
1141 }
1142
1143 VkResult
1144 anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
1145 struct anv_bo **bo_out)
1146 {
1147 const unsigned size_log2 = size < 4096 ? 12 : util_logbase2_ceil(size);
1148 const unsigned pow2_size = 1 << size_log2;
1149 const unsigned bucket = size_log2 - 12;
1150 assert(bucket < ARRAY_SIZE(pool->free_list));
1151
1152 struct anv_bo *bo =
1153 util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]);
1154 if (bo != NULL) {
1155 VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1156 *bo_out = bo;
1157 return VK_SUCCESS;
1158 }
1159
1160 VkResult result = anv_device_alloc_bo(pool->device,
1161 pool->name,
1162 pow2_size,
1163 pool->bo_alloc_flags,
1164 0 /* explicit_address */,
1165 &bo);
1166 if (result != VK_SUCCESS)
1167 return result;
1168
1169 /* We want it to look like it came from this pool */
1170 VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
1171 VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
1172
1173 *bo_out = bo;
1174
1175 return VK_SUCCESS;
1176 }
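/* Worked example of the bucketing above: a request for 5000 bytes has
 * util_logbase2_ceil(5000) == 13, so it is rounded up to an 8192 byte BO and
 * lives in bucket 13 - 12 = 1; requests of 4096 bytes or less all share
 * bucket 0. Freed BOs keep their power-of-two size, which is what lets
 * anv_bo_pool_free() recover the bucket from bo->size alone.
 */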
1177
1178 void
1179 anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo)
1180 {
1181 VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));
1182
1183 assert(util_is_power_of_two_or_zero(bo->size));
1184 const unsigned size_log2 = util_logbase2_ceil(bo->size);
1185 const unsigned bucket = size_log2 - 12;
1186 assert(bucket < ARRAY_SIZE(pool->free_list));
1187
1188 assert(util_sparse_array_get(&pool->device->bo_cache.bo_map,
1189 bo->gem_handle) == bo);
1190 util_sparse_array_free_list_push(&pool->free_list[bucket],
1191 &bo->gem_handle, 1);
1192 }
1193
1194 /* Scratch pool */
1195
1196 void
1197 anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool)
1198 {
1199 memset(pool, 0, sizeof(*pool));
1200 }
1201
1202 void
1203 anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool)
1204 {
1205 for (unsigned s = 0; s < ARRAY_SIZE(pool->bos[0]); s++) {
1206 for (unsigned i = 0; i < 16; i++) {
1207 if (pool->bos[i][s] != NULL)
1208 anv_device_release_bo(device, pool->bos[i][s]);
1209 }
1210 }
1211
1212 for (unsigned i = 0; i < 16; i++) {
1213 if (pool->surf_states[i].map != NULL) {
1214 anv_state_pool_free(&device->scratch_surface_state_pool,
1215 pool->surf_states[i]);
1216 }
1217 }
1218 }
1219
1220 struct anv_bo *
1221 anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
1222 gl_shader_stage stage, unsigned per_thread_scratch)
1223 {
1224 if (per_thread_scratch == 0)
1225 return NULL;
1226
1227 unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1228 assert(scratch_size_log2 < 16);
1229
1230 assert(stage < ARRAY_SIZE(pool->bos));
1231
1232 const struct intel_device_info *devinfo = device->info;
1233
1234 /* On GFX version 12.5, scratch access changed to a surface-based model.
1235 * Instead of each shader type having its own layout based on IDs passed
1236 * from the relevant fixed-function unit, all scratch access is based on
1237 * thread IDs like it always has been for compute.
1238 */
1239 if (devinfo->verx10 >= 125)
1240 stage = MESA_SHADER_COMPUTE;
1241
1242 struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]);
1243
1244 if (bo != NULL)
1245 return bo;
1246
1247 assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
1248 uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage];
1249
1250 /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they
1251 * are still relative to the general state base address. When we emit
1252 * STATE_BASE_ADDRESS, we set general state base address to 0 and the size
1253 * to the maximum (1 page under 4GB). This allows us to just place the
1254 * scratch buffers anywhere we wish in the bottom 32 bits of address space
1255 * and just set the scratch base pointer in 3DSTATE_*S using a relocation.
1256 * However, in order to do so, we need to ensure that the kernel does not
1257 * place the scratch BO above the 32-bit boundary.
1258 *
1259 * NOTE: Technically, it can't go "anywhere" because the top page is off
1260 * limits. However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the
1261 * kernel allocates space using
1262 *
1263 * end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE);
1264 *
1265 * so nothing will ever touch the top page.
1266 */
1267 const enum anv_bo_alloc_flags alloc_flags =
1268 ANV_BO_ALLOC_INTERNAL |
1269 (devinfo->verx10 < 125 ? ANV_BO_ALLOC_32BIT_ADDRESS : 0);
1270 VkResult result = anv_device_alloc_bo(device, "scratch", size,
1271 alloc_flags,
1272 0 /* explicit_address */,
1273 &bo);
1274 if (result != VK_SUCCESS)
1275 return NULL; /* TODO */
1276
1277 struct anv_bo *current_bo =
1278 p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo);
1279 if (current_bo) {
1280 anv_device_release_bo(device, bo);
1281 return current_bo;
1282 } else {
1283 return bo;
1284 }
1285 }
1286
1287 uint32_t
1288 anv_scratch_pool_get_surf(struct anv_device *device,
1289 struct anv_scratch_pool *pool,
1290 unsigned per_thread_scratch)
1291 {
1292 assert(device->info->verx10 >= 125);
1293
1294 if (per_thread_scratch == 0)
1295 return 0;
1296
1297 unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
1298 assert(scratch_size_log2 < 16);
1299
1300 uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]);
1301 if (surf > 0)
1302 return surf;
1303
1304 struct anv_bo *bo =
1305 anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE,
1306 per_thread_scratch);
1307 struct anv_address addr = { .bo = bo };
1308
1309 struct anv_state state =
1310 anv_state_pool_alloc(&device->scratch_surface_state_pool,
1311 device->isl_dev.ss.size, 64);
1312
1313 isl_buffer_fill_state(&device->isl_dev, state.map,
1314 .address = anv_address_physical(addr),
1315 .size_B = bo->size,
1316 .mocs = anv_mocs(device, bo, 0),
1317 .format = ISL_FORMAT_RAW,
1318 .swizzle = ISL_SWIZZLE_IDENTITY,
1319 .stride_B = per_thread_scratch,
1320 .is_scratch = true);
1321
1322 uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
1323 0, state.offset);
1324 if (current) {
1325 anv_state_pool_free(&device->scratch_surface_state_pool, state);
1326 return current;
1327 } else {
1328 pool->surf_states[scratch_size_log2] = state;
1329 return state.offset;
1330 }
1331 }
1332
1333 VkResult
1334 anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device)
1335 {
1336 util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024);
1337
1338 if (pthread_mutex_init(&cache->mutex, NULL)) {
1339 util_sparse_array_finish(&cache->bo_map);
1340 return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
1341 "pthread_mutex_init failed: %m");
1342 }
1343
1344 return VK_SUCCESS;
1345 }
1346
1347 void
1348 anv_bo_cache_finish(struct anv_bo_cache *cache)
1349 {
1350 util_sparse_array_finish(&cache->bo_map);
1351 pthread_mutex_destroy(&cache->mutex);
1352 }
1353
1354 static void
1355 anv_bo_unmap_close(struct anv_device *device, struct anv_bo *bo)
1356 {
1357 if (bo->map && !bo->from_host_ptr)
1358 anv_device_unmap_bo(device, bo, bo->map, bo->size);
1359
1360 assert(bo->gem_handle != 0);
1361 device->kmd_backend->gem_close(device, bo);
1362 }
1363
1364 static void
1365 anv_bo_vma_free(struct anv_device *device, struct anv_bo *bo)
1366 {
1367 if (bo->offset != 0 && !(bo->alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS)) {
1368 assert(bo->vma_heap != NULL);
1369 anv_vma_free(device, bo->vma_heap, bo->offset, bo->size);
1370 }
1371 bo->vma_heap = NULL;
1372 }
1373
1374 static void
1375 anv_bo_finish(struct anv_device *device, struct anv_bo *bo)
1376 {
1377 /* Not releasing vma in case unbind fails */
1378 if (device->kmd_backend->vm_unbind_bo(device, bo) == 0)
1379 anv_bo_vma_free(device, bo);
1380
1381 anv_bo_unmap_close(device, bo);
1382 }
1383
1384 static VkResult
1385 anv_bo_vma_alloc_or_close(struct anv_device *device,
1386 struct anv_bo *bo,
1387 enum anv_bo_alloc_flags alloc_flags,
1388 uint64_t explicit_address)
1389 {
1390 assert(bo->vma_heap == NULL);
1391 assert(explicit_address == intel_48b_address(explicit_address));
1392
1393 uint32_t align = device->physical->info.mem_alignment;
1394
1395 /* If it's big enough to store a tiled resource, we need 64K alignment */
1396 if (bo->size >= 64 * 1024)
1397 align = MAX2(64 * 1024, align);
1398
1399 /* If we're using the AUX map, make sure we follow the required
1400 * alignment.
1401 */
1402 if (alloc_flags & ANV_BO_ALLOC_AUX_TT_ALIGNED)
1403 align = MAX2(intel_aux_map_get_alignment(device->aux_map_ctx), align);
1404
1405 /* Opportunistically align addresses to 2MB when above 1MB. We do this
1406 * because this gives an opportunity for the kernel to use Transparent Huge
1407 * Pages (the 2MB page table layout) for faster memory access.
1408 *
1409 * Only available on ICL+.
1410 */
1411 if (device->info->ver >= 11 && bo->size >= 1 * 1024 * 1024)
1412 align = MAX2(2 * 1024 * 1024, align);
1413
1414 if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
1415 bo->offset = intel_canonical_address(explicit_address);
1416 } else {
1417 bo->offset = anv_vma_alloc(device, bo->size, align, alloc_flags,
1418 explicit_address, &bo->vma_heap);
1419 if (bo->offset == 0) {
1420 anv_bo_unmap_close(device, bo);
1421 return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
1422 "failed to allocate virtual address for BO");
1423 }
1424 }
1425
1426 return VK_SUCCESS;
1427 }
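/* Illustrative summary of the alignment selection above (the exact numbers
 * depend on the platform): a 4KB BO uses the base mem_alignment, a 128KB BO
 * is bumped to 64KB alignment for tiled resources, and on Gfx11+ a 3MB BO is
 * bumped again to 2MB so the kernel can back it with transparent huge pages.
 * ANV_BO_ALLOC_AUX_TT_ALIGNED can raise the alignment further to whatever
 * the AUX map requires.
 */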
1428
1429 enum intel_device_info_mmap_mode
1430 anv_bo_get_mmap_mode(struct anv_device *device, struct anv_bo *bo)
1431 {
1432 enum anv_bo_alloc_flags alloc_flags = bo->alloc_flags;
1433
1434 if (device->info->has_set_pat_uapi)
1435 return anv_device_get_pat_entry(device, alloc_flags)->mmap;
1436
1437 if (anv_physical_device_has_vram(device->physical)) {
1438 if ((alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) ||
1439 (alloc_flags & ANV_BO_ALLOC_IMPORTED))
1440 return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1441
1442 return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1443 }
1444
1445 /* gfx9 atom */
1446 if (!device->info->has_llc) {
1447 /* The user wants cached and coherent memory, but achieving that without
1448 * LLC on older platforms requires DRM_IOCTL_I915_GEM_SET_CACHING to be
1449 * supported and set.
1450 */
1451 if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
1452 return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1453
1454 return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1455 }
1456
1457 if (alloc_flags & (ANV_BO_ALLOC_SCANOUT | ANV_BO_ALLOC_EXTERNAL))
1458 return INTEL_DEVICE_INFO_MMAP_MODE_WC;
1459
1460 return INTEL_DEVICE_INFO_MMAP_MODE_WB;
1461 }
1462
1463 VkResult
1464 anv_device_alloc_bo(struct anv_device *device,
1465 const char *name,
1466 uint64_t size,
1467 enum anv_bo_alloc_flags alloc_flags,
1468 uint64_t explicit_address,
1469 struct anv_bo **bo_out)
1470 {
1471 /* bo that needs CPU access needs to be HOST_CACHED, HOST_COHERENT or both */
1472 assert((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0 ||
1473 (alloc_flags & (ANV_BO_ALLOC_HOST_CACHED | ANV_BO_ALLOC_HOST_COHERENT)));
1474
1475 /* The KMD requires a valid PAT index, so set HOST_COHERENT/WC on bos that
1476 * don't need CPU access.
1477 */
1478 if ((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0)
1479 alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
1480
1481 /* On platforms with LLC we can promote all bos to cached+coherent for free */
1482 const enum anv_bo_alloc_flags not_allowed_promotion = ANV_BO_ALLOC_SCANOUT |
1483 ANV_BO_ALLOC_EXTERNAL |
1484 ANV_BO_ALLOC_PROTECTED;
1485 if (device->info->has_llc && ((alloc_flags & not_allowed_promotion) == 0))
1486 alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
1487
1488 const uint32_t bo_flags =
1489 device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
1490
1491 /* The kernel is going to give us whole pages anyway. */
1492 size = align64(size, 4096);
1493
1494 const uint64_t ccs_offset = size;
1495 if (alloc_flags & ANV_BO_ALLOC_AUX_CCS) {
1496 assert(device->info->has_aux_map);
1497 size += DIV_ROUND_UP(size, intel_aux_get_main_to_aux_ratio(device->aux_map_ctx));
1498 size = align64(size, 4096);
1499 }
1500
1501 const struct intel_memory_class_instance *regions[2];
1502 uint32_t nregions = 0;
1503
1504 /* If we have vram, we have multiple memory regions and should choose
1505 * one of them.
1506 */
1507 if (anv_physical_device_has_vram(device->physical)) {
1508 /* This always tries to put the object in local memory. Here
1509 * vram_non_mappable & vram_mappable actually are the same region.
1510 */
1511 if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
1512 regions[nregions++] = device->physical->sys.region;
1513 else
1514 regions[nregions++] = device->physical->vram_non_mappable.region;
1515
1516 /* If the buffer is mapped on the host, add the system memory region.
1517 * This ensures that if the buffer cannot live in mappable local memory,
1518 * it can be spilled to system memory.
1519 */
1520 if (!(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
1521 ((alloc_flags & ANV_BO_ALLOC_MAPPED) ||
1522 (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)))
1523 regions[nregions++] = device->physical->sys.region;
1524 } else {
1525 regions[nregions++] = device->physical->sys.region;
1526 }
1527
1528 uint64_t actual_size;
1529 uint32_t gem_handle = device->kmd_backend->gem_create(device, regions,
1530 nregions, size,
1531 alloc_flags,
1532 &actual_size);
1533 if (gem_handle == 0)
1534 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1535
1536 struct anv_bo new_bo = {
1537 .name = name,
1538 .gem_handle = gem_handle,
1539 .refcount = 1,
1540 .offset = -1,
1541 .size = size,
1542 .ccs_offset = ccs_offset,
1543 .actual_size = actual_size,
1544 .flags = bo_flags,
1545 .alloc_flags = alloc_flags,
1546 };
1547
1548 if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
1549 VkResult result = anv_device_map_bo(device, &new_bo, 0, size, &new_bo.map);
1550 if (unlikely(result != VK_SUCCESS)) {
1551 device->kmd_backend->gem_close(device, &new_bo);
1552 return result;
1553 }
1554 }
1555
1556 VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1557 alloc_flags,
1558 explicit_address);
1559 if (result != VK_SUCCESS)
1560 return result;
1561
1562 if (device->kmd_backend->vm_bind_bo(device, &new_bo)) {
1563 anv_bo_vma_free(device, &new_bo);
1564 anv_bo_unmap_close(device, &new_bo);
1565 return vk_errorf(device, VK_ERROR_UNKNOWN, "vm bind failed");
1566 }
1567
1568 assert(new_bo.gem_handle);
1569
1570 /* If we just got this gem_handle from anv_bo_init_new then we know no one
1571 * else is touching this BO at the moment so we don't need to lock here.
1572 */
1573 struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle);
1574 *bo = new_bo;
1575
1576 *bo_out = bo;
1577
1578 ANV_RMV(bo_allocate, device, bo);
1579
1580 return VK_SUCCESS;
1581 }
1582
1583 VkResult
1584 anv_device_map_bo(struct anv_device *device,
1585 struct anv_bo *bo,
1586 uint64_t offset,
1587 size_t size,
1588 void **map_out)
1589 {
1590 assert(!bo->from_host_ptr);
1591 assert(size > 0);
1592
1593 void *map = anv_gem_mmap(device, bo, offset, size);
1594 if (unlikely(map == MAP_FAILED))
1595 return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
1596
1597 assert(map != NULL);
1598
1599 if (map_out)
1600 *map_out = map;
1601
1602 return VK_SUCCESS;
1603 }
1604
1605 void
1606 anv_device_unmap_bo(struct anv_device *device,
1607 struct anv_bo *bo,
1608 void *map, size_t map_size)
1609 {
1610 assert(!bo->from_host_ptr);
1611
1612 anv_gem_munmap(device, map, map_size);
1613 }
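/* Usage sketch (illustrative only): a caller pairs the two helpers above on
 * an already-allocated, host-mappable BO.  The 4096-byte window and the
 * memset are arbitrary.
 *
 *    void *map;
 *    if (anv_device_map_bo(device, bo, 0, 4096, &map) == VK_SUCCESS) {
 *       memset(map, 0, 4096);   // CPU access through the mapping
 *       anv_device_unmap_bo(device, bo, map, 4096);
 *    }
 */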
1614
1615 VkResult
1616 anv_device_import_bo_from_host_ptr(struct anv_device *device,
1617 void *host_ptr, uint32_t size,
1618 enum anv_bo_alloc_flags alloc_flags,
1619 uint64_t client_address,
1620 struct anv_bo **bo_out)
1621 {
1622 assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
1623 ANV_BO_ALLOC_HOST_CACHED |
1624 ANV_BO_ALLOC_HOST_COHERENT |
1625 ANV_BO_ALLOC_AUX_CCS |
1626 ANV_BO_ALLOC_PROTECTED |
1627 ANV_BO_ALLOC_FIXED_ADDRESS)));
1628 assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
1629
1630 struct anv_bo_cache *cache = &device->bo_cache;
1631 const uint32_t bo_flags =
1632 device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
1633
1634 uint32_t gem_handle = device->kmd_backend->gem_create_userptr(device, host_ptr, size);
1635 if (!gem_handle)
1636 return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1637
1638 pthread_mutex_lock(&cache->mutex);
1639
1640 struct anv_bo *bo = NULL;
1641 if (device->info->kmd_type == INTEL_KMD_TYPE_XE) {
1642 bo = vk_zalloc(&device->vk.alloc, sizeof(*bo), 8,
1643 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1644 if (!bo) {
1645 pthread_mutex_unlock(&cache->mutex);
1646 return VK_ERROR_OUT_OF_HOST_MEMORY;
1647 }
1648 } else {
1649 bo = anv_device_lookup_bo(device, gem_handle);
1650 }
1651
1652 if (bo->refcount > 0) {
1653 /* VK_EXT_external_memory_host doesn't require handling imports of the
1654 * same pointer twice at the same time, but we don't get in the way. If the
1655 * kernel gives us the same gem_handle, we only succeed if the flags match.
1656 */
1657 assert(bo->gem_handle == gem_handle);
1658 if (bo_flags != bo->flags) {
1659 pthread_mutex_unlock(&cache->mutex);
1660 return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1661 "same host pointer imported two different ways");
1662 }
1663
1664 if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
1665 (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
1666 pthread_mutex_unlock(&cache->mutex);
1667 return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1668 "The same BO was imported with and without buffer "
1669 "device address");
1670 }
1671
1672 if (client_address && client_address != intel_48b_address(bo->offset)) {
1673 pthread_mutex_unlock(&cache->mutex);
1674 return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1675 "The same BO was imported at two different "
1676 "addresses");
1677 }
1678
1679 __sync_fetch_and_add(&bo->refcount, 1);
1680 } else {
1681 alloc_flags |= ANV_BO_ALLOC_IMPORTED;
1682 struct anv_bo new_bo = {
1683 .name = "host-ptr",
1684 .gem_handle = gem_handle,
1685 .refcount = 1,
1686 .offset = -1,
1687 .size = size,
1688 .actual_size = size,
1689 .map = host_ptr,
1690 .flags = bo_flags,
1691 .alloc_flags = alloc_flags,
1692 .from_host_ptr = true,
1693 };
1694
1695 VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1696 alloc_flags,
1697 client_address);
1698 if (result != VK_SUCCESS) {
1699 pthread_mutex_unlock(&cache->mutex);
1700 return result;
1701 }
1702
1703 if (device->kmd_backend->vm_bind_bo(device, &new_bo)) {
1704 VkResult res = vk_errorf(device, VK_ERROR_UNKNOWN, "vm bind failed: %m");
1705 anv_bo_vma_free(device, &new_bo);
1706 pthread_mutex_unlock(&cache->mutex);
1707 return res;
1708 }
1709
1710 *bo = new_bo;
1711
1712 ANV_RMV(bo_allocate, device, bo);
1713 }
1714
1715 pthread_mutex_unlock(&cache->mutex);
1716 *bo_out = bo;
1717
1718 return VK_SUCCESS;
1719 }
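/* Usage sketch (illustrative only, not part of the driver): importing a
 * page-aligned host allocation.  The 64 KiB size is arbitrary;
 * ANV_BO_ALLOC_EXTERNAL is required by the assertion at the top of this
 * function.
 *
 *    void *host_mem = NULL;
 *    struct anv_bo *bo;
 *    if (posix_memalign(&host_mem, 4096, 64 * 1024) == 0) {
 *       VkResult result =
 *          anv_device_import_bo_from_host_ptr(device, host_mem, 64 * 1024,
 *                                             ANV_BO_ALLOC_EXTERNAL,
 *                                             0,  // no client address
 *                                             &bo);
 *       // ... use bo, then anv_device_release_bo(device, bo) and free(host_mem)
 *    }
 */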
1720
1721 VkResult
1722 anv_device_import_bo(struct anv_device *device,
1723 int fd,
1724 enum anv_bo_alloc_flags alloc_flags,
1725 uint64_t client_address,
1726 struct anv_bo **bo_out)
1727 {
1728 assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
1729 ANV_BO_ALLOC_HOST_CACHED |
1730 ANV_BO_ALLOC_HOST_COHERENT |
1731 ANV_BO_ALLOC_FIXED_ADDRESS)));
1732 assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
1733
1734 struct anv_bo_cache *cache = &device->bo_cache;
1735
1736 pthread_mutex_lock(&cache->mutex);
1737
1738 uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);
1739 if (!gem_handle) {
1740 pthread_mutex_unlock(&cache->mutex);
1741 return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1742 }
1743
1744 struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
1745
1746 uint32_t bo_flags;
1747 VkResult result = anv_gem_import_bo_alloc_flags_to_bo_flags(device, bo,
1748 alloc_flags,
1749 &bo_flags);
1750 if (result != VK_SUCCESS) {
1751 pthread_mutex_unlock(&cache->mutex);
1752 return result;
1753 }
1754
1755 if (bo->refcount > 0) {
1756 if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
1757 (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
1758 pthread_mutex_unlock(&cache->mutex);
1759 return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1760 "The same BO was imported with and without buffer "
1761 "device address");
1762 }
1763
1764 if (client_address && client_address != intel_48b_address(bo->offset)) {
1765 pthread_mutex_unlock(&cache->mutex);
1766 return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1767 "The same BO was imported at two different "
1768 "addresses");
1769 }
1770
1771 __sync_fetch_and_add(&bo->refcount, 1);
1772 } else {
1773 alloc_flags |= ANV_BO_ALLOC_IMPORTED;
1774 struct anv_bo new_bo = {
1775 .name = "imported",
1776 .gem_handle = gem_handle,
1777 .refcount = 1,
1778 .offset = -1,
1779 .alloc_flags = alloc_flags,
1780 };
1781
1782 off_t size = lseek(fd, 0, SEEK_END);
1783 if (size == (off_t)-1) {
1784 device->kmd_backend->gem_close(device, &new_bo);
1785 pthread_mutex_unlock(&cache->mutex);
1786 return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
1787 }
1788 new_bo.size = size;
1789 new_bo.actual_size = size;
1790
1791 VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
1792 alloc_flags,
1793 client_address);
1794 if (result != VK_SUCCESS) {
1795 pthread_mutex_unlock(&cache->mutex);
1796 return result;
1797 }
1798
1799 if (device->kmd_backend->vm_bind_bo(device, &new_bo)) {
1800 anv_bo_vma_free(device, &new_bo);
1801 pthread_mutex_unlock(&cache->mutex);
1802 return vk_errorf(device, VK_ERROR_UNKNOWN, "vm bind failed");
1803 }
1804
1805 *bo = new_bo;
1806
1807 ANV_RMV(bo_allocate, device, bo);
1808 }
1809
1810 bo->flags = bo_flags;
1811
1812 pthread_mutex_unlock(&cache->mutex);
1813 *bo_out = bo;
1814
1815 return VK_SUCCESS;
1816 }
1817
1818 VkResult
1819 anv_device_export_bo(struct anv_device *device,
1820 struct anv_bo *bo, int *fd_out)
1821 {
1822 assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);
1823
1824 /* This BO must have been flagged external in order for us to be able
1825 * to export it. This is done based on external options passed into
1826 * anv_AllocateMemory.
1827 */
1828 assert(anv_bo_is_external(bo));
1829
1830 int fd = anv_gem_handle_to_fd(device, bo->gem_handle);
1831 if (fd < 0)
1832 return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS);
1833
1834 *fd_out = fd;
1835
1836 return VK_SUCCESS;
1837 }
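/* Usage sketch (illustrative only): exporting a BO as a dma-buf fd for an
 * external consumer.  Who closes the fd depends on the handle-transfer
 * semantics of the consuming API; here the exporter closes its copy once it
 * is done with it.
 *
 *    int fd;
 *    if (anv_device_export_bo(device, bo, &fd) == VK_SUCCESS) {
 *       // ... hand fd to the consumer (dup()'d or transferred) ...
 *       close(fd);
 *    }
 */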
1838
1839 VkResult
1840 anv_device_get_bo_tiling(struct anv_device *device,
1841 struct anv_bo *bo,
1842 enum isl_tiling *tiling_out)
1843 {
1844 int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
1845 if (i915_tiling < 0) {
1846 return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
1847 "failed to get BO tiling: %m");
1848 }
1849
1850 *tiling_out = isl_tiling_from_i915_tiling(i915_tiling);
1851
1852 return VK_SUCCESS;
1853 }
1854
1855 VkResult
1856 anv_device_set_bo_tiling(struct anv_device *device,
1857 struct anv_bo *bo,
1858 uint32_t row_pitch_B,
1859 enum isl_tiling tiling)
1860 {
1861 int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B,
1862 isl_tiling_to_i915_tiling(tiling));
1863 if (ret) {
1864 return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
1865 "failed to set BO tiling: %m");
1866 }
1867
1868 return VK_SUCCESS;
1869 }
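/* Usage sketch (illustrative only): the two wrappers above round-trip a BO's
 * tiling mode through the i915 get/set tiling ioctls, e.g.:
 *
 *    enum isl_tiling tiling;
 *    if (anv_device_set_bo_tiling(device, bo, row_pitch_B,
 *                                 ISL_TILING_X) == VK_SUCCESS &&
 *        anv_device_get_bo_tiling(device, bo, &tiling) == VK_SUCCESS)
 *       assert(tiling == ISL_TILING_X);
 */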
1870
1871 static bool
1872 atomic_dec_not_one(uint32_t *counter)
1873 {
1874 uint32_t old, val;
1875
1876 val = *counter;
1877 while (1) {
1878 if (val == 1)
1879 return false;
1880
1881 old = __sync_val_compare_and_swap(counter, val, val - 1);
1882 if (old == val)
1883 return true;
1884
1885 val = old;
1886 }
1887 }
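/* Behaviour sketch for the CAS loop above: the counter is only decremented
 * when the result provably stays >= 1, e.g.
 *
 *    *counter == 3  ->  CAS(3, 2) succeeds, returns true   (counter now 2)
 *    *counter == 1  ->  returns false without writing      (counter still 1)
 *
 * A failed CAS reloads the observed value and retries, so the final 1 -> 0
 * transition never happens here; anv_device_release_bo() performs it under
 * the BO cache mutex instead.
 */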
1888
1889 void
1890 anv_device_release_bo(struct anv_device *device,
1891 struct anv_bo *bo)
1892 {
1893 struct anv_bo_cache *cache = &device->bo_cache;
1894 const bool bo_is_xe_userptr = device->info->kmd_type == INTEL_KMD_TYPE_XE &&
1895 bo->from_host_ptr;
1896 assert(bo_is_xe_userptr ||
1897 anv_device_lookup_bo(device, bo->gem_handle) == bo);
1898
1899 /* Try to decrement the counter but don't go below one. If this succeeds
1900 * then the refcount has been decremented and we are not the last
1901 * reference.
1902 */
1903 if (atomic_dec_not_one(&bo->refcount))
1904 return;
1905
1906 ANV_RMV(bo_destroy, device, bo);
1907
1908 pthread_mutex_lock(&cache->mutex);
1909
1910 /* We are probably the last reference since our attempt to decrement above
1911 * failed. However, we can't actually know until we are inside the mutex.
1912 * Otherwise, someone could import the BO between the decrement and our
1913 * taking the mutex.
1914 */
1915 if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) {
1916 /* Turns out we're not the last reference. Unlock and bail. */
1917 pthread_mutex_unlock(&cache->mutex);
1918 return;
1919 }
1920 assert(bo->refcount == 0);
1921
1922 /* Memset the BO just in case. The refcount being zero should be enough to
1923 * prevent anyone from assuming the data is valid, but it's safer to stomp
1924 * it to zero anyway. We explicitly do this *before* we actually close the
1925 * GEM handle to ensure that if anyone allocates something and gets the
1926 * same GEM handle, the memset has already happened and won't stomp all
1927 * over any data they may write in this BO.
1928 */
1929 struct anv_bo old_bo = *bo;
1930
1931 if (bo_is_xe_userptr)
1932 vk_free(&device->vk.alloc, bo);
1933 else
1934 memset(bo, 0, sizeof(*bo));
1935
1936 anv_bo_finish(device, &old_bo);
1937
1938 /* Don't unlock until we've actually closed the BO. The whole point of
1939 * the BO cache is to ensure that we correctly handle races with creating
1940 * and releasing GEM handles and we don't want to let someone import the BO
1941 * again between mutex unlock and closing the GEM handle.
1942 */
1943 pthread_mutex_unlock(&cache->mutex);
1944 }
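/* Race sketch (illustrative): why the final decrement happens under the
 * cache mutex.  Thread A releases what it believes is the last reference
 * while thread B re-imports the same gem_handle:
 *
 *    A: atomic_dec_not_one() fails (refcount == 1)
 *    B: import path takes the mutex, bumps refcount 1 -> 2, unlocks
 *    A: takes the mutex, __sync_sub_and_fetch() -> 1, sees > 0 and bails
 *
 * Without the mutex around the 1 -> 0 transition and the GEM close, A could
 * tear down a BO that B had just started using.
 */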
1945