1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include <xf86drm.h>
31
32 #include "anv_private.h"
33 #include "anv_measure.h"
34
35 #include "genxml/gen9_pack.h"
36 #include "genxml/genX_bits.h"
37
38 #include "util/perf/u_trace.h"
39
40 /** \file anv_batch_chain.c
41 *
42 * This file contains functions related to anv_cmd_buffer as a data
43 * structure. This involves everything required to create and destroy
44 * the actual batch buffers as well as link them together.
45 *
46 * It specifically does *not* contain any handling of actual vkCmd calls
47 * beyond vkCmdExecuteCommands.
48 */
49
50 /*-----------------------------------------------------------------------*
51 * Functions related to anv_reloc_list
52 *-----------------------------------------------------------------------*/
53
54 VkResult
55 anv_reloc_list_init(struct anv_reloc_list *list,
56 const VkAllocationCallbacks *alloc,
57 bool uses_relocs)
58 {
59 assert(alloc != NULL);
60 memset(list, 0, sizeof(*list));
61 list->uses_relocs = uses_relocs;
62 list->alloc = alloc;
63 return VK_SUCCESS;
64 }
65
66 static VkResult
67 anv_reloc_list_init_clone(struct anv_reloc_list *list,
68 const struct anv_reloc_list *other_list)
69 {
70 list->dep_words = other_list->dep_words;
71
72 if (list->dep_words > 0) {
73 list->deps =
74 vk_alloc(list->alloc, list->dep_words * sizeof(BITSET_WORD), 8,
75 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
76 memcpy(list->deps, other_list->deps,
77 list->dep_words * sizeof(BITSET_WORD));
78 } else {
79 list->deps = NULL;
80 }
81
82 return VK_SUCCESS;
83 }
84
85 void
86 anv_reloc_list_finish(struct anv_reloc_list *list)
87 {
88 vk_free(list->alloc, list->deps);
89 }
90
91 static VkResult
92 anv_reloc_list_grow_deps(struct anv_reloc_list *list,
93 uint32_t min_num_words)
94 {
95 if (min_num_words <= list->dep_words)
96 return VK_SUCCESS;
97
98 uint32_t new_length = MAX2(32, list->dep_words * 2);
99 while (new_length < min_num_words)
100 new_length *= 2;
101
102 BITSET_WORD *new_deps =
103 vk_realloc(list->alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,
104 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
105 if (new_deps == NULL)
106 return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
107 list->deps = new_deps;
108
109 /* Zero out the new data */
110 memset(list->deps + list->dep_words, 0,
111 (new_length - list->dep_words) * sizeof(BITSET_WORD));
112 list->dep_words = new_length;
113
114 return VK_SUCCESS;
115 }
116
117 VkResult
118 anv_reloc_list_add_bo_impl(struct anv_reloc_list *list,
119 struct anv_bo *target_bo)
120 {
121 /* This can happen with sparse resources. */
122 if (!target_bo)
123 return VK_SUCCESS;
124
125 uint32_t idx = target_bo->gem_handle;
126 VkResult result = anv_reloc_list_grow_deps(list,
127 (idx / BITSET_WORDBITS) + 1);
128 if (unlikely(result != VK_SUCCESS))
129 return result;
130
131 BITSET_SET(list->deps, idx);
132
133 return VK_SUCCESS;
134 }
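
/* Illustrative sketch (not compiled): the dependency set above is a bitset
 * indexed by GEM handle, so recording that a batch uses a BO and later
 * checking for that dependency looks roughly like this (target_bo is a
 * hypothetical BO pointer):
 *
 *    anv_reloc_list_add_bo_impl(&cmd_buffer->surface_relocs, target_bo);
 *    assert(BITSET_TEST(cmd_buffer->surface_relocs.deps,
 *                       target_bo->gem_handle));
 */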
135
136 static void
137 anv_reloc_list_clear(struct anv_reloc_list *list)
138 {
139 if (list->dep_words > 0)
140 memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
141 }
142
143 VkResult
144 anv_reloc_list_append(struct anv_reloc_list *list,
145 struct anv_reloc_list *other)
146 {
147 anv_reloc_list_grow_deps(list, other->dep_words);
148 for (uint32_t w = 0; w < other->dep_words; w++)
149 list->deps[w] |= other->deps[w];
150
151 return VK_SUCCESS;
152 }
153
154 /*-----------------------------------------------------------------------*
155 * Functions related to anv_batch
156 *-----------------------------------------------------------------------*/
157
158 static VkResult
159 anv_extend_batch(struct anv_batch *batch, uint32_t size)
160 {
161 assert(batch->extend_cb != NULL);
162 VkResult result = batch->extend_cb(batch, size, batch->user_data);
163 if (result != VK_SUCCESS)
164 return anv_batch_set_error(batch, result);
165 return result;
166 }
167
168 void *
169 anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
170 {
171 uint32_t size = num_dwords * 4;
172 if (batch->next + size > batch->end) {
173 if (anv_extend_batch(batch, size) != VK_SUCCESS)
174 return NULL;
175 }
176
177 void *p = batch->next;
178
179 batch->next += num_dwords * 4;
180 assert(batch->next <= batch->end);
181
182 return p;
183 }
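
/* Illustrative sketch (not compiled): raw dword emission has to handle the
 * NULL return, which means the allocation failed and the error has already
 * been recorded on the batch. A hypothetical caller writing two MI_NOOP
 * dwords:
 *
 *    uint32_t *dw = anv_batch_emit_dwords(batch, 2);
 *    if (dw == NULL)
 *       return;   // batch->status already holds the error
 *    dw[0] = 0;   // MI_NOOP
 *    dw[1] = 0;   // MI_NOOP
 */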
184
185 /* Ensure enough contiguous space is available */
186 VkResult
187 anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size)
188 {
189 if (batch->next + size > batch->end) {
190 VkResult result = anv_extend_batch(batch, size);
191 if (result != VK_SUCCESS)
192 return result;
193 }
194
195 assert(batch->next + size <= batch->end);
196
197 return VK_SUCCESS;
198 }
199
200 void
201 anv_batch_advance(struct anv_batch *batch, uint32_t size)
202 {
203 assert(batch->next + size <= batch->end);
204
205 batch->next += size;
206 }
207
208 struct anv_address
209 anv_batch_address(struct anv_batch *batch, void *batch_location)
210 {
211 assert(batch->start <= batch_location);
212
213 /* Allow a jump at the current location of the batch. */
214 assert(batch->next >= batch_location);
215
216 return anv_address_add(batch->start_addr, batch_location - batch->start);
217 }
218
219 void
220 anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
221 {
222 uint32_t size = other->next - other->start;
223 assert(size % 4 == 0);
224
225 if (batch->next + size > batch->end) {
226 if (anv_extend_batch(batch, size) != VK_SUCCESS)
227 return;
228 }
229
230 assert(batch->next + size <= batch->end);
231
232 VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
233 memcpy(batch->next, other->start, size);
234
235 VkResult result = anv_reloc_list_append(batch->relocs, other->relocs);
236 if (result != VK_SUCCESS) {
237 anv_batch_set_error(batch, result);
238 return;
239 }
240
241 batch->next += size;
242 }
243
244 /*-----------------------------------------------------------------------*
245 * Functions related to anv_batch_bo
246 *-----------------------------------------------------------------------*/
247
248 static VkResult
249 anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
250 uint32_t size,
251 struct anv_batch_bo **bbo_out)
252 {
253 VkResult result;
254
255 struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
256 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
257 if (bbo == NULL)
258 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
259
260 result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
261 size, &bbo->bo);
262 if (result != VK_SUCCESS)
263 goto fail_alloc;
264
265 const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
266 result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc, uses_relocs);
267 if (result != VK_SUCCESS)
268 goto fail_bo_alloc;
269
270 *bbo_out = bbo;
271
272 return VK_SUCCESS;
273
274 fail_bo_alloc:
275 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
276 fail_alloc:
277 vk_free(&cmd_buffer->vk.pool->alloc, bbo);
278
279 return result;
280 }
281
282 static VkResult
283 anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
284 const struct anv_batch_bo *other_bbo,
285 struct anv_batch_bo **bbo_out)
286 {
287 VkResult result;
288
289 struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
290 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
291 if (bbo == NULL)
292 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
293
294 result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
295 other_bbo->bo->size, &bbo->bo);
296 if (result != VK_SUCCESS)
297 goto fail_alloc;
298
299 result = anv_reloc_list_init_clone(&bbo->relocs, &other_bbo->relocs);
300 if (result != VK_SUCCESS)
301 goto fail_bo_alloc;
302
303 bbo->length = other_bbo->length;
304 memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length);
305 *bbo_out = bbo;
306
307 return VK_SUCCESS;
308
309 fail_bo_alloc:
310 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
311 fail_alloc:
312 vk_free(&cmd_buffer->vk.pool->alloc, bbo);
313
314 return result;
315 }
316
317 static void
318 anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
319 size_t batch_padding)
320 {
321 anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, },
322 bbo->bo->map, bbo->bo->size - batch_padding);
323 batch->relocs = &bbo->relocs;
324 anv_reloc_list_clear(&bbo->relocs);
325 }
326
327 static void
328 anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch,
329 size_t batch_padding)
330 {
331 batch->start_addr = (struct anv_address) { .bo = bbo->bo, };
332 batch->start = bbo->bo->map;
333 batch->next = bbo->bo->map + bbo->length;
334 batch->end = bbo->bo->map + bbo->bo->size - batch_padding;
335 batch->relocs = &bbo->relocs;
336 }
337
338 static void
339 anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
340 {
341 assert(batch->start == bbo->bo->map);
342 bbo->length = batch->next - batch->start;
343 VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
344 }
345
346 static void
347 anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
348 struct anv_batch_bo *prev_bbo,
349 struct anv_batch_bo *next_bbo,
350 uint32_t next_bbo_offset)
351 {
352 const uint32_t bb_start_offset =
353 prev_bbo->length - GFX9_MI_BATCH_BUFFER_START_length * 4;
354 ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset;
355
356 /* Make sure we're looking at a MI_BATCH_BUFFER_START */
357 assert(((*bb_start >> 29) & 0x07) == 0);
358 assert(((*bb_start >> 23) & 0x3f) == 49);
359
360 uint64_t *map = prev_bbo->bo->map + bb_start_offset + 4;
361 *map = intel_canonical_address(next_bbo->bo->offset + next_bbo_offset);
362
363 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
364 if (cmd_buffer->device->physical->memory.need_flush &&
365 anv_bo_needs_host_cache_flush(prev_bbo->bo->alloc_flags))
366 intel_flush_range(map, sizeof(uint64_t));
367 #endif
368 }
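
/* Worked example (illustrative) of the patch performed above: on GFX9,
 * MI_BATCH_BUFFER_START is GFX9_MI_BATCH_BUFFER_START_length = 3 dwords, so
 * the last 12 bytes of prev_bbo are laid out as:
 *
 *    bytes 0-3 : header (command type 0 in bits 31:29, opcode 49 in 28:23)
 *    bytes 4-11: 64-bit BatchBufferStartAddress
 *
 * which is why the address qword is rewritten at bb_start_offset + 4 with
 * the canonicalized address of next_bbo->bo->offset + next_bbo_offset.
 */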
369
370 static void
371 anv_batch_bo_destroy(struct anv_batch_bo *bbo,
372 struct anv_cmd_buffer *cmd_buffer)
373 {
374 anv_reloc_list_finish(&bbo->relocs);
375 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
376 vk_free(&cmd_buffer->vk.pool->alloc, bbo);
377 }
378
379 static VkResult
380 anv_batch_bo_list_clone(const struct list_head *list,
381 struct anv_cmd_buffer *cmd_buffer,
382 struct list_head *new_list)
383 {
384 VkResult result = VK_SUCCESS;
385
386 list_inithead(new_list);
387
388 struct anv_batch_bo *prev_bbo = NULL;
389 list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
390 struct anv_batch_bo *new_bbo = NULL;
391 result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);
392 if (result != VK_SUCCESS)
393 break;
394 list_addtail(&new_bbo->link, new_list);
395
396 if (prev_bbo)
397 anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0);
398
399 prev_bbo = new_bbo;
400 }
401
402 if (result != VK_SUCCESS) {
403 list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) {
404 list_del(&bbo->link);
405 anv_batch_bo_destroy(bbo, cmd_buffer);
406 }
407 }
408
409 return result;
410 }
411
412 /*-----------------------------------------------------------------------*
413 * Functions related to anv_batch_bo
414 *-----------------------------------------------------------------------*/
415
416 static struct anv_batch_bo *
417 anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
418 {
419 return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link);
420 }
421
422 static struct anv_batch_bo *
423 anv_cmd_buffer_current_generation_batch_bo(struct anv_cmd_buffer *cmd_buffer)
424 {
425 return list_entry(cmd_buffer->generation.batch_bos.prev, struct anv_batch_bo, link);
426 }
427
428 struct anv_address
429 anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
430 {
431 /* Only graphics & compute queues need binding tables. */
432 if (!(cmd_buffer->queue_family->queueFlags & (VK_QUEUE_GRAPHICS_BIT |
433 VK_QUEUE_COMPUTE_BIT)))
434 return ANV_NULL_ADDRESS;
435
436 /* If we've never allocated a binding table block, do it now. Otherwise we
437 * would trigger another STATE_BASE_ADDRESS emission, which would require an
438 * additional round of flushes/stalls.
439 */
440 if (u_vector_length(&cmd_buffer->bt_block_states) == 0) {
441 VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
442 if (result != VK_SUCCESS) {
443 anv_batch_set_error(&cmd_buffer->batch, result);
444 return ANV_NULL_ADDRESS;
445 }
446 }
447
448 struct anv_state_pool *pool = &cmd_buffer->device->binding_table_pool;
449 struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
450 return (struct anv_address) {
451 .bo = pool->block_pool.bo,
452 .offset = bt_block->offset - pool->start_offset,
453 };
454 }
455
456 static void
457 emit_batch_buffer_start(struct anv_batch *batch,
458 struct anv_bo *bo, uint32_t offset)
459 {
460 anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
461 bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length -
462 GFX9_MI_BATCH_BUFFER_START_length_bias;
463 bbs.SecondLevelBatchBuffer = Firstlevelbatch;
464 bbs.AddressSpaceIndicator = ASI_PPGTT;
465 bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset };
466 }
467 }
468
469 enum anv_cmd_buffer_batch {
470 ANV_CMD_BUFFER_BATCH_MAIN,
471 ANV_CMD_BUFFER_BATCH_GENERATION,
472 };
473
474 static void
475 cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
476 struct anv_batch_bo *bbo,
477 enum anv_cmd_buffer_batch batch_type)
478 {
479 struct anv_batch *batch =
480 batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
481 &cmd_buffer->generation.batch : &cmd_buffer->batch;
482 struct anv_batch_bo *current_bbo =
483 batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
484 anv_cmd_buffer_current_generation_batch_bo(cmd_buffer) :
485 anv_cmd_buffer_current_batch_bo(cmd_buffer);
486
487 /* We set the end of the batch a little short so we would be sure we
488 * have room for the chaining command. Since we're about to emit the
489 * chaining command, let's set it back where it should go.
490 */
491 batch->end += GFX9_MI_BATCH_BUFFER_START_length * 4;
492 assert(batch->end == current_bbo->bo->map + current_bbo->bo->size);
493
494 emit_batch_buffer_start(batch, bbo->bo, 0);
495
496 anv_batch_bo_finish(current_bbo, batch);
497
498 /* Add the current amount of data written in the current_bbo to the command
499 * buffer.
500 */
501 cmd_buffer->total_batch_size += current_bbo->length;
502 }
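
/* Illustrative sketch (not compiled) of the padding invariant used above:
 * every batch BO is started with its end pulled back by the size of one
 * MI_BATCH_BUFFER_START, so the chaining command emitted here can never
 * overflow the BO:
 *
 *    anv_batch_bo_start(bbo, batch, GFX9_MI_BATCH_BUFFER_START_length * 4);
 *    ...
 *    batch->end += GFX9_MI_BATCH_BUFFER_START_length * 4;  // undo padding
 *    emit_batch_buffer_start(batch, next_bbo->bo, 0);      // always fits
 */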
503
504 static void
505 anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
506 struct anv_cmd_buffer *cmd_buffer_to)
507 {
508 uint32_t *bb_start = cmd_buffer_from->batch_end;
509
510 struct anv_batch_bo *last_bbo =
511 list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link);
512 struct anv_batch_bo *first_bbo =
513 list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
514
515 struct GFX9_MI_BATCH_BUFFER_START gen_bb_start = {
516 __anv_cmd_header(GFX9_MI_BATCH_BUFFER_START),
517 .SecondLevelBatchBuffer = Firstlevelbatch,
518 .AddressSpaceIndicator = ASI_PPGTT,
519 .BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 },
520 };
521 struct anv_batch local_batch = {
522 .start = last_bbo->bo->map,
523 .end = last_bbo->bo->map + last_bbo->bo->size,
524 .relocs = &last_bbo->relocs,
525 .alloc = &cmd_buffer_from->vk.pool->alloc,
526 };
527
528 __anv_cmd_pack(GFX9_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
529
530 last_bbo->chained = true;
531 }
532
533 static void
534 anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
535 {
536 struct anv_batch_bo *last_bbo =
537 list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
538 last_bbo->chained = false;
539
540 uint32_t *batch = cmd_buffer->batch_end;
541 anv_pack_struct(batch, GFX9_MI_BATCH_BUFFER_END,
542 __anv_cmd_header(GFX9_MI_BATCH_BUFFER_END));
543 }
544
545 static VkResult
546 anv_cmd_buffer_chain_batch(struct anv_batch *batch, uint32_t size, void *_data)
547 {
548 /* The caller should not need that much space. Otherwise it should split
549 * its commands.
550 */
551 assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
552
553 struct anv_cmd_buffer *cmd_buffer = _data;
554 struct anv_batch_bo *new_bbo = NULL;
555 /* Amount of reserved space at the end of the batch to account for the
556 * chaining instruction.
557 */
558 const uint32_t batch_padding = GFX9_MI_BATCH_BUFFER_START_length * 4;
559 /* Cap reallocation to chunk. */
560 uint32_t alloc_size = MIN2(
561 MAX2(batch->allocated_batch_size, size + batch_padding),
562 ANV_MAX_CMD_BUFFER_BATCH_SIZE);
563
564 VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
565 if (result != VK_SUCCESS)
566 return result;
567
568 batch->allocated_batch_size += alloc_size;
569
570 struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
571 if (seen_bbo == NULL) {
572 anv_batch_bo_destroy(new_bbo, cmd_buffer);
573 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
574 }
575 *seen_bbo = new_bbo;
576
577 cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo, ANV_CMD_BUFFER_BATCH_MAIN);
578
579 list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);
580
581 anv_batch_bo_start(new_bbo, batch, batch_padding);
582
583 return VK_SUCCESS;
584 }
585
586 static VkResult
587 anv_cmd_buffer_chain_generation_batch(struct anv_batch *batch, uint32_t size, void *_data)
588 {
589 /* The caller should not need that much space. Otherwise it should split
590 * its commands.
591 */
592 assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
593
594 struct anv_cmd_buffer *cmd_buffer = _data;
595 struct anv_batch_bo *new_bbo = NULL;
596 /* Cap reallocation to chunk. */
597 uint32_t alloc_size = MIN2(
598 MAX2(batch->allocated_batch_size, size),
599 ANV_MAX_CMD_BUFFER_BATCH_SIZE);
600
601 VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
602 if (result != VK_SUCCESS)
603 return result;
604
605 batch->allocated_batch_size += alloc_size;
606
607 struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
608 if (seen_bbo == NULL) {
609 anv_batch_bo_destroy(new_bbo, cmd_buffer);
610 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
611 }
612 *seen_bbo = new_bbo;
613
614 if (!list_is_empty(&cmd_buffer->generation.batch_bos)) {
615 cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo,
616 ANV_CMD_BUFFER_BATCH_GENERATION);
617 }
618
619 list_addtail(&new_bbo->link, &cmd_buffer->generation.batch_bos);
620
621 anv_batch_bo_start(new_bbo, batch, GFX9_MI_BATCH_BUFFER_START_length * 4);
622
623 return VK_SUCCESS;
624 }
625
626 /** Allocate a binding table
627 *
628 * This function allocates a binding table. This is a bit more complicated
629 * than one would think due to a combination of Vulkan driver design and some
630 * unfortunate hardware restrictions.
631 *
632 * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for
633 * the binding table pointer which means that all binding tables need to live
634 * in the bottom 64k of surface state base address. The way the GL driver has
635 * classically dealt with this restriction is to emit all surface states
636 * on-the-fly into the batch and have a batch buffer smaller than 64k. This
637 * isn't really an option in Vulkan for a couple of reasons:
638 *
639 * 1) In Vulkan, we have growing (or chaining) batches so surface states have
640 * to live in their own buffer and we have to be able to re-emit
641 * STATE_BASE_ADDRESS as needed which requires a full pipeline stall. In
642 * order to avoid emitting STATE_BASE_ADDRESS any more often than needed
643 * (it's not that hard to hit 64k of just binding tables), we allocate
644 * surface state objects up-front when VkImageView is created. In order
645 * for this to work, surface state objects need to be allocated from a
646 * global buffer.
647 *
648 * 2) We tried to design the surface state system in such a way that it's
649 * already ready for bindless texturing. The way bindless texturing works
650 * on our hardware is that you have a big pool of surface state objects
651 * (with its own state base address) and the bindless handles are simply
652 * offsets into that pool. With the architecture we chose, we already
653 * have that pool and it's exactly the same pool that we use for regular
654 * surface states so we should already be ready for bindless.
655 *
656 * 3) For render targets, we need to be able to fill out the surface states
657 * later in vkBeginRenderPass so that we can assign clear colors
658 * correctly. One way to do this would be to just create the surface
659 * state data and then repeatedly copy it into the surface state BO every
660 * time we have to re-emit STATE_BASE_ADDRESS. While this works, it's
661 * rather annoying; it's much simpler to just allocate them up-front and
662 * re-use them for the entire render pass.
663 *
664 * While none of these are technically blockers for emitting state on the fly
665 * like we do in GL, the ability to have a single surface state pool
666 * simplifies things greatly. Unfortunately, it comes at a cost...
667 *
668 * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't
669 * place the binding tables just anywhere in surface state base address.
670 * Because 64k isn't a whole lot of space, we can't simply restrict the
671 * surface state buffer to 64k, we have to be more clever. The solution we've
672 * chosen is to have a block pool with a maximum size of 2G that starts at
673 * zero and grows in both directions. All surface states are allocated from
674 * the top of the pool (positive offsets) and we allocate blocks (< 64k) of
675 * binding tables from the bottom of the pool (negative offsets). Every time
676 * we allocate a new binding table block, we set surface state base address to
677 * point to the bottom of the binding table block. This way all of the
678 * binding tables in the block are in the bottom 64k of surface state base
679 * address. When we fill out the binding table, we add the distance between
680 * the bottom of our binding table block and zero of the block pool to the
681 * surface state offsets so that they are correct relative to our new surface
682 * state base address at the bottom of the binding table block.
683 *
684 * \param[in] entries The number of surface state entries the binding
685 * table should be able to hold.
686 *
687 * \param[out] state_offset The offset from surface state base address
688 * where the surface states live. This must be
689 * added to the surface state offset when it is
690 * written into the binding table entry.
691 *
692 * \return An anv_state representing the binding table
693 */
694 struct anv_state
695 anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
696 uint32_t entries, uint32_t *state_offset)
697 {
698 if (u_vector_length(&cmd_buffer->bt_block_states) == 0)
699 return (struct anv_state) { 0 };
700
701 struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
702
703 uint32_t bt_size = align(entries * 4, 32);
704
705 struct anv_state state = cmd_buffer->bt_next;
706 if (bt_size > state.alloc_size)
707 return (struct anv_state) { 0 };
708
709 state.alloc_size = bt_size;
710 cmd_buffer->bt_next.offset += bt_size;
711 cmd_buffer->bt_next.map += bt_size;
712 cmd_buffer->bt_next.alloc_size -= bt_size;
713
714 if (cmd_buffer->device->info->verx10 >= 125) {
715 /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding
716 * table address independently from surface state base address. We no
717 * longer need any sort of offsetting.
718 */
719 *state_offset = 0;
720 } else {
721 assert(bt_block->offset < 0);
722 *state_offset = -bt_block->offset;
723 }
724
725 return state;
726 }
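
/* Worked example (illustrative, platforms below verx10 125): suppose the
 * current binding table block was allocated at bt_block->offset = -4096,
 * i.e. surface state base address now points 4096 bytes below the block
 * pool's zero. A surface state that lives at pool offset +8192 must be
 * written into the binding table as 8192 + *state_offset = 8192 + 4096 =
 * 12288, which is exactly its distance from the new surface state base
 * address.
 */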
727
728 struct anv_state
729 anv_cmd_buffer_alloc_surface_states(struct anv_cmd_buffer *cmd_buffer,
730 uint32_t count)
731 {
732 if (count == 0)
733 return ANV_STATE_NULL;
734 struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
735 struct anv_state state =
736 anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
737 count * isl_dev->ss.size,
738 isl_dev->ss.align);
739 if (state.map == NULL)
740 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
741 return state;
742 }
743
744 struct anv_state
745 anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
746 uint32_t size, uint32_t alignment)
747 {
748 if (size == 0)
749 return ANV_STATE_NULL;
750 struct anv_state state =
751 anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
752 size, alignment);
753 if (state.map == NULL)
754 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
755 return state;
756 }
757
758 struct anv_state
759 anv_cmd_buffer_alloc_general_state(struct anv_cmd_buffer *cmd_buffer,
760 uint32_t size, uint32_t alignment)
761 {
762 if (size == 0)
763 return ANV_STATE_NULL;
764 struct anv_state state =
765 anv_state_stream_alloc(&cmd_buffer->general_state_stream,
766 size, alignment);
767 if (state.map == NULL)
768 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
769 return state;
770 }
771
772 /** Allocate space associated with a command buffer
773 *
774 * Some commands like vkCmdBuildAccelerationStructuresKHR() can end up needing
775 * large amounts of temporary buffer space. This function is here to deal with those
776 * potentially larger allocations, using a side BO if needed.
777 *
778 */
779 struct anv_cmd_alloc
780 anv_cmd_buffer_alloc_space(struct anv_cmd_buffer *cmd_buffer,
781 size_t size, uint32_t alignment,
782 bool mapped)
783 {
784 /* Below 16k, source memory from dynamic state, otherwise allocate a BO. */
785 if (size < 16 * 1024) {
786 struct anv_state state =
787 anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
788 size, alignment);
789 if (state.map == NULL) {
790 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
791 return (struct anv_cmd_alloc) {
792 .address = ANV_NULL_ADDRESS,
793 };
794 }
795
796 return (struct anv_cmd_alloc) {
797 .address = anv_state_pool_state_address(
798 &cmd_buffer->device->dynamic_state_pool,
799 state),
800 .map = state.map,
801 .size = size,
802 };
803 }
804
805 assert(alignment <= 4096);
806
807 struct anv_bo *bo = NULL;
808 VkResult result =
809 anv_bo_pool_alloc(mapped ?
810 &cmd_buffer->device->batch_bo_pool :
811 &cmd_buffer->device->bvh_bo_pool,
812 align(size, 4096), &bo);
813 if (result != VK_SUCCESS) {
814 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
815 return ANV_EMPTY_ALLOC;
816 }
817
818 struct anv_bo **bo_entry =
819 u_vector_add(&cmd_buffer->dynamic_bos);
820 if (bo_entry == NULL) {
821 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
822 anv_bo_pool_free(bo->map != NULL ?
823 &cmd_buffer->device->batch_bo_pool :
824 &cmd_buffer->device->bvh_bo_pool, bo);
825 return ANV_EMPTY_ALLOC;
826 }
827 *bo_entry = bo;
828
829 return (struct anv_cmd_alloc) {
830 .address = (struct anv_address) { .bo = bo },
831 .map = bo->map,
832 .size = size,
833 };
834 }
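
/* Illustrative sketch (not compiled): callers check the returned address
 * instead of a VkResult, since failures are already recorded on the batch.
 * The 64k scratch size below is hypothetical:
 *
 *    struct anv_cmd_alloc alloc =
 *       anv_cmd_buffer_alloc_space(cmd_buffer, 64 * 1024, 64,
 *                                  true); // mapped
 *    if (anv_address_is_null(alloc.address))
 *       return;   // error already set on cmd_buffer->batch
 */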
835
836 VkResult
837 anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
838 {
839 struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
840 if (bt_block == NULL) {
841 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
842 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
843 }
844
845 *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);
846
847 /* The bt_next state is a rolling state (we update it as we suballocate
848 * from it) which is relative to the start of the binding table block.
849 */
850 cmd_buffer->bt_next = *bt_block;
851 cmd_buffer->bt_next.offset = 0;
852
853 return VK_SUCCESS;
854 }
855
856 VkResult
857 anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
858 {
859 struct anv_batch_bo *batch_bo = NULL;
860 VkResult result;
861
862 list_inithead(&cmd_buffer->batch_bos);
863
864 cmd_buffer->total_batch_size = 0;
865
866 result = anv_batch_bo_create(cmd_buffer,
867 ANV_MIN_CMD_BUFFER_BATCH_SIZE,
868 &batch_bo);
869 if (result != VK_SUCCESS)
870 return result;
871
872 list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);
873
874 cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc;
875 cmd_buffer->batch.user_data = cmd_buffer;
876 cmd_buffer->batch.allocated_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;
877
878 cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
879 cmd_buffer->batch.engine_class = cmd_buffer->queue_family->engine_class;
880
881 anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
882 GFX9_MI_BATCH_BUFFER_START_length * 4);
883
884 /* Generation batch is initialized empty since it's possible it won't be
885 * used.
886 */
887 list_inithead(&cmd_buffer->generation.batch_bos);
888
889 cmd_buffer->generation.batch.alloc = &cmd_buffer->vk.pool->alloc;
890 cmd_buffer->generation.batch.user_data = cmd_buffer;
891 cmd_buffer->generation.batch.allocated_batch_size = 0;
892 cmd_buffer->generation.batch.extend_cb = anv_cmd_buffer_chain_generation_batch;
893 cmd_buffer->generation.batch.engine_class =
894 cmd_buffer->queue_family->engine_class;
895
896 int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8,
897 sizeof(struct anv_bo *));
898 if (!success)
899 goto fail_batch_bo;
900
901 *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
902
903 success = u_vector_init(&cmd_buffer->bt_block_states, 8,
904 sizeof(struct anv_state));
905 if (!success)
906 goto fail_seen_bbos;
907
908 const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
909 result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
910 &cmd_buffer->vk.pool->alloc, uses_relocs);
911 if (result != VK_SUCCESS)
912 goto fail_bt_blocks;
913
914 return VK_SUCCESS;
915
916 fail_bt_blocks:
917 u_vector_finish(&cmd_buffer->bt_block_states);
918 fail_seen_bbos:
919 u_vector_finish(&cmd_buffer->seen_bbos);
920 fail_batch_bo:
921 anv_batch_bo_destroy(batch_bo, cmd_buffer);
922
923 return result;
924 }
925
926 void
927 anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
928 {
929 struct anv_state *bt_block;
930 u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)
931 anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
932 u_vector_finish(&cmd_buffer->bt_block_states);
933
934 anv_reloc_list_finish(&cmd_buffer->surface_relocs);
935
936 u_vector_finish(&cmd_buffer->seen_bbos);
937
938 /* Destroy all of the batch buffers */
939 list_for_each_entry_safe(struct anv_batch_bo, bbo,
940 &cmd_buffer->batch_bos, link) {
941 list_del(&bbo->link);
942 anv_batch_bo_destroy(bbo, cmd_buffer);
943 }
944 /* Also destroy all generation batch buffers */
945 list_for_each_entry_safe(struct anv_batch_bo, bbo,
946 &cmd_buffer->generation.batch_bos, link) {
947 list_del(&bbo->link);
948 anv_batch_bo_destroy(bbo, cmd_buffer);
949 }
950
951 if (cmd_buffer->generation.ring_bo) {
952 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
953 cmd_buffer->generation.ring_bo);
954 }
955 }
956
957 void
958 anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
959 {
960 /* Delete all but the first batch bo */
961 assert(!list_is_empty(&cmd_buffer->batch_bos));
962 while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {
963 struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
964 list_del(&bbo->link);
965 anv_batch_bo_destroy(bbo, cmd_buffer);
966 }
967 assert(!list_is_empty(&cmd_buffer->batch_bos));
968
969 anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
970 &cmd_buffer->batch,
971 GFX9_MI_BATCH_BUFFER_START_length * 4);
972
973 while (u_vector_length(&cmd_buffer->bt_block_states) > 0) {
974 struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
975 anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
976 }
977 cmd_buffer->bt_next = ANV_STATE_NULL;
978
979 anv_reloc_list_clear(&cmd_buffer->surface_relocs);
980
981 /* Reset the list of seen buffers */
982 cmd_buffer->seen_bbos.head = 0;
983 cmd_buffer->seen_bbos.tail = 0;
984
985 struct anv_batch_bo *first_bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
986
987 *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo;
988
989 assert(first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);
990 cmd_buffer->batch.allocated_batch_size = first_bbo->bo->size;
991
992 /* Delete all generation batch bos */
993 list_for_each_entry_safe(struct anv_batch_bo, bbo,
994 &cmd_buffer->generation.batch_bos, link) {
995 list_del(&bbo->link);
996 anv_batch_bo_destroy(bbo, cmd_buffer);
997 }
998
999 /* And reset generation batch */
1000 cmd_buffer->generation.batch.allocated_batch_size = 0;
1001 cmd_buffer->generation.batch.start = NULL;
1002 cmd_buffer->generation.batch.end = NULL;
1003 cmd_buffer->generation.batch.next = NULL;
1004
1005 if (cmd_buffer->generation.ring_bo) {
1006 anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
1007 cmd_buffer->generation.ring_bo);
1008 cmd_buffer->generation.ring_bo = NULL;
1009 }
1010
1011 cmd_buffer->total_batch_size = 0;
1012 }
1013
1014 void
1015 anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
1016 {
1017 const struct intel_device_info *devinfo = cmd_buffer->device->info;
1018 struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
1019
1020 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1021 /* When we start a batch buffer, we subtract a certain amount of
1022 * padding from the end to ensure that we always have room to emit a
1023 * BATCH_BUFFER_START to chain to the next BO. We need to remove
1024 * that padding before we end the batch; otherwise, we may end up
1025 * with our BATCH_BUFFER_END in another BO.
1026 */
1027 cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
1028 assert(cmd_buffer->batch.start == batch_bo->bo->map);
1029 assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
1030
1031 /* Save end instruction location to override it later. */
1032 cmd_buffer->batch_end = cmd_buffer->batch.next;
1033
1034 /* If we can chain this command buffer to another one, leave some place
1035 * for the jump instruction.
1036 */
1037 batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer);
1038 if (batch_bo->chained)
1039 emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
1040 else
1041 anv_batch_emit(&cmd_buffer->batch, GFX9_MI_BATCH_BUFFER_END, bbe);
1042
1043 /* Round batch up to an even number of dwords. */
1044 if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
1045 anv_batch_emit(&cmd_buffer->batch, GFX9_MI_NOOP, noop);
1046
1047 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
1048 } else {
1049 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1050 /* If this is a secondary command buffer, we need to determine the
1051 * mode in which it will be executed with vkExecuteCommands. We
1052 * determine this statically here so that this stays in sync with the
1053 * actual ExecuteCommands implementation.
1054 */
1055 const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start;
1056 if (cmd_buffer->device->physical->use_call_secondary) {
1057 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN;
1058
1059 void *jump_addr =
1060 anv_genX(devinfo, batch_emit_return)(&cmd_buffer->batch) +
1061 (GFX9_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
1062 cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);
1063
1064 /* The emit above may have caused us to chain batch buffers which
1065 * would mean that batch_bo is no longer valid.
1066 */
1067 batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
1068 } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
1069 (length < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2)) {
1070 /* If the secondary has exactly one batch buffer in its list *and*
1071 * that batch buffer is less than half of the maximum size, we're
1072 * probably better off simply copying it into our batch.
1073 */
1074 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
1075 } else if (!(cmd_buffer->usage_flags &
1076 VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
1077 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
1078
1079 /* In order to chain, we need this command buffer to contain an
1080 * MI_BATCH_BUFFER_START which will jump back to the calling batch.
1081 * It doesn't matter where it points now so long as it has a valid
1082 * relocation. We'll adjust it later as part of the chaining
1083 * process.
1084 *
1085 * We set the end of the batch a little short so we would be sure we
1086 * have room for the chaining command. Since we're about to emit the
1087 * chaining command, let's set it back where it should go.
1088 */
1089 cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
1090 assert(cmd_buffer->batch.start == batch_bo->bo->map);
1091 assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
1092
1093 emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
1094 assert(cmd_buffer->batch.start == batch_bo->bo->map);
1095 } else {
1096 cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
1097 }
1098 }
1099
1100 anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
1101
1102 /* Add the current amount of data written in the current_bbo to the command
1103 * buffer.
1104 */
1105 cmd_buffer->total_batch_size += batch_bo->length;
1106 }
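
/* Summary (illustrative) of the secondary execution modes chosen above:
 *
 *    use_call_secondary                              -> CALL_AND_RETURN
 *    one BO and < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2  -> EMIT (copied inline)
 *    no SIMULTANEOUS_USE flag                        -> CHAIN (patch last BB_START)
 *    otherwise                                       -> COPY_AND_CHAIN (clone BOs)
 */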
1107
1108 static VkResult
1109 anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
1110 struct list_head *list)
1111 {
1112 list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
1113 struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);
1114 if (bbo_ptr == NULL)
1115 return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
1116
1117 *bbo_ptr = bbo;
1118 }
1119
1120 return VK_SUCCESS;
1121 }
1122
1123 void
1124 anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
1125 struct anv_cmd_buffer *secondary)
1126 {
1127 anv_measure_add_secondary(primary, secondary);
1128 switch (secondary->exec_mode) {
1129 case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
1130 anv_batch_emit_batch(&primary->batch, &secondary->batch);
1131 break;
1132 case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
1133 struct anv_batch_bo *first_bbo =
1134 list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1135 struct anv_batch_bo *last_bbo =
1136 list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1137
1138 emit_batch_buffer_start(&primary->batch, first_bbo->bo, 0);
1139
1140 struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
1141 assert(primary->batch.start == this_bbo->bo->map);
1142 uint32_t offset = primary->batch.next - primary->batch.start;
1143
1144 /* Make the tail of the secondary point back to right after the
1145 * MI_BATCH_BUFFER_START in the primary batch.
1146 */
1147 anv_batch_bo_link(primary, last_bbo, this_bbo, offset);
1148
1149 anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
1150 break;
1151 }
1152 case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
1153 struct list_head copy_list;
1154 VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,
1155 secondary,
1156 &copy_list);
1157 if (result != VK_SUCCESS)
1158 return; /* FIXME */
1159
1160 anv_cmd_buffer_add_seen_bbos(primary, &copy_list);
1161
1162 struct anv_batch_bo *first_bbo =
1163 list_first_entry(&copy_list, struct anv_batch_bo, link);
1164 struct anv_batch_bo *last_bbo =
1165 list_last_entry(&copy_list, struct anv_batch_bo, link);
1166
1167 cmd_buffer_chain_to_batch_bo(primary, first_bbo,
1168 ANV_CMD_BUFFER_BATCH_MAIN);
1169
1170 list_splicetail(&copy_list, &primary->batch_bos);
1171
1172 anv_batch_bo_continue(last_bbo, &primary->batch,
1173 GFX9_MI_BATCH_BUFFER_START_length * 4);
1174 break;
1175 }
1176 case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: {
1177 struct anv_batch_bo *first_bbo =
1178 list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1179
1180 anv_genX(primary->device->info, batch_emit_secondary_call)(
1181 &primary->batch,
1182 (struct anv_address) { .bo = first_bbo->bo },
1183 secondary->return_addr);
1184
1185 anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
1186 break;
1187 }
1188 default:
1189 assert(!"Invalid execution mode");
1190 }
1191
1192 anv_reloc_list_append(&primary->surface_relocs, &secondary->surface_relocs);
1193
1194 /* Add the amount of data written in the secondary buffer to the primary
1195 * command buffer.
1196 */
1197 primary->total_batch_size += secondary->total_batch_size;
1198 }
1199
1200 void
1201 anv_cmd_buffer_chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
1202 uint32_t num_cmd_buffers)
1203 {
1204 if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
1205 assert(num_cmd_buffers == 1);
1206 return;
1207 }
1208
1209 /* Chain the N-1 first batch buffers */
1210 for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) {
1211 assert(cmd_buffers[i]->companion_rcs_cmd_buffer == NULL);
1212 anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
1213 }
1214
1215 /* Put an end to the last one */
1216 anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
1217 }
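
/* Illustrative sketch (not compiled): the submission code hands this helper
 * the contiguous run of chainable command buffers it is about to execute.
 * For a hypothetical array of three primaries a, b and c:
 *
 *    struct anv_cmd_buffer *cmd_buffers[3] = { a, b, c };
 *    anv_cmd_buffer_chain_command_buffers(cmd_buffers, 3);
 *    // a -> b and b -> c now end in MI_BATCH_BUFFER_START,
 *    // c ends in MI_BATCH_BUFFER_END
 */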
1218
1219 static void
1220 anv_print_batch(struct anv_device *device,
1221 struct anv_queue *queue,
1222 struct anv_cmd_buffer *cmd_buffer)
1223 {
1224 struct anv_batch_bo *bbo =
1225 list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
1226 device->cmd_buffer_being_decoded = cmd_buffer;
1227 struct intel_batch_decode_ctx *ctx = queue->decoder;
1228
1229 if (cmd_buffer->is_companion_rcs_cmd_buffer) {
1230 int render_queue_idx =
1231 anv_get_first_render_queue_index(device->physical);
1232 ctx = &device->decoder[render_queue_idx];
1233 }
1234
1235 if (INTEL_DEBUG(DEBUG_BATCH)) {
1236 intel_print_batch(ctx, bbo->bo->map,
1237 bbo->bo->size, bbo->bo->offset, false);
1238 }
1239 if (INTEL_DEBUG(DEBUG_BATCH_STATS)) {
1240 intel_batch_stats(ctx, bbo->bo->map,
1241 bbo->bo->size, bbo->bo->offset, false);
1242 }
1243 device->cmd_buffer_being_decoded = NULL;
1244 }
1245
1246 void
1247 anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
1248 uint32_t cmd_buffer_count,
1249 struct anv_cmd_buffer **cmd_buffers,
1250 struct anv_query_pool *perf_query_pool,
1251 uint32_t perf_query_pass)
1252 {
1253 if (!INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS))
1254 return;
1255
1256 struct anv_device *device = queue->device;
1257 const bool has_perf_query = perf_query_pool && perf_query_pass >= 0 &&
1258 cmd_buffer_count;
1259 uint64_t frame_id = device->debug_frame_desc->frame_id;
1260
1261 if (!intel_debug_batch_in_range(device->debug_frame_desc->frame_id))
1262 return;
1263 fprintf(stderr, "Batch for frame %"PRIu64" on queue %d\n",
1264 frame_id, (int)(queue - device->queues));
1265
1266 if (cmd_buffer_count) {
1267 if (has_perf_query) {
1268 struct anv_bo *pass_batch_bo = perf_query_pool->bo;
1269 uint64_t pass_batch_offset =
1270 khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass);
1271
1272 if (INTEL_DEBUG(DEBUG_BATCH)) {
1273 intel_print_batch(queue->decoder,
1274 pass_batch_bo->map + pass_batch_offset, 64,
1275 pass_batch_bo->offset + pass_batch_offset, false);
1276 }
1277 }
1278
1279 for (uint32_t i = 0; i < cmd_buffer_count; i++)
1280 anv_print_batch(device, queue, cmd_buffers[i]);
1281 } else if (INTEL_DEBUG(DEBUG_BATCH)) {
1282 intel_print_batch(queue->decoder, device->trivial_batch_bo->map,
1283 device->trivial_batch_bo->size,
1284 device->trivial_batch_bo->offset, false);
1285 }
1286 }
1287
1288 /* We lock around execbuf for two main reasons:
1289 *
1290 * 1) When a block pool is resized, we create a new gem handle with a
1291 * different size and, in the case of surface states, possibly a different
1292 * center offset but we re-use the same anv_bo struct when we do so. If
1293 * this happens in the middle of setting up an execbuf, we could end up
1294 * with our list of BOs out of sync with our list of gem handles.
1295 *
1296 * 2) The algorithm we use for building the list of unique buffers isn't
1297 * thread-safe. While the client is supposed to synchronize around
1298 * QueueSubmit, this would be extremely difficult to debug if it ever came
1299 * up in the wild due to a broken app. It's better to play it safe and
1300 * just lock around QueueSubmit.
1301 *
1302 * Since the only other things that ever take the device lock, such as block
1303 * pool resize, happen only rarely, this will almost never be contended so
1304 * taking a lock isn't really an expensive operation in this case.
1305 */
1306 static inline VkResult
1307 anv_queue_exec_locked(struct anv_queue *queue,
1308 uint32_t wait_count,
1309 const struct vk_sync_wait *waits,
1310 uint32_t cmd_buffer_count,
1311 struct anv_cmd_buffer **cmd_buffers,
1312 uint32_t signal_count,
1313 const struct vk_sync_signal *signals,
1314 struct anv_query_pool *perf_query_pool,
1315 uint32_t perf_query_pass,
1316 struct anv_utrace_submit *utrace_submit)
1317 {
1318 struct anv_device *device = queue->device;
1319 VkResult result = VK_SUCCESS;
1320
1321 /* We only need to synchronize the main & companion command buffers if we
1322 * have a companion command buffer somewhere in the list of command
1323 * buffers.
1324 */
1325 bool needs_companion_sync = false;
1326 for (uint32_t i = 0; i < cmd_buffer_count; i++) {
1327 if (cmd_buffers[i]->companion_rcs_cmd_buffer != NULL) {
1328 needs_companion_sync = true;
1329 break;
1330 }
1331 }
1332
1333 result =
1334 device->kmd_backend->queue_exec_locked(
1335 queue,
1336 wait_count, waits,
1337 cmd_buffer_count, cmd_buffers,
1338 needs_companion_sync ? 0 : signal_count, signals,
1339 perf_query_pool,
1340 perf_query_pass,
1341 utrace_submit);
1342 if (result != VK_SUCCESS)
1343 return result;
1344
1345 if (needs_companion_sync) {
1346 struct vk_sync_wait companion_sync = {
1347 .sync = queue->companion_sync,
1348 };
1349 /* If any of the command buffer had a companion batch, the submission
1350 * backend will signal queue->companion_sync, so to ensure completion,
1351 * we just need to wait on that fence.
1352 */
1353 result =
1354 device->kmd_backend->queue_exec_locked(queue,
1355 1, &companion_sync,
1356 0, NULL,
1357 signal_count, signals,
1358 NULL, 0,
1359 NULL);
1360 }
1361
1362 return result;
1363 }
1364
1365 static inline bool
1366 can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2)
1367 {
1368 return (!p1 || !p2 || p1 == p2);
1369 }
1370
1371 static VkResult
1372 anv_queue_submit_sparse_bind_locked(struct anv_queue *queue,
1373 struct vk_queue_submit *submit)
1374 {
1375 struct anv_device *device = queue->device;
1376 VkResult result;
1377
1378 /* When fake sparse is enabled, while we do accept creating "sparse"
1379 * resources we can't really handle sparse submission. Fake sparse is
1380 * supposed to be used by applications that request sparse to be enabled
1381 * but don't actually *use* it.
1382 */
1383 if (!device->physical->has_sparse) {
1384 if (INTEL_DEBUG(DEBUG_SPARSE))
1385 fprintf(stderr, "=== application submitting sparse operations: "
1386 "buffer_bind:%d image_opaque_bind:%d image_bind:%d\n",
1387 submit->buffer_bind_count, submit->image_opaque_bind_count,
1388 submit->image_bind_count);
1389 return vk_queue_set_lost(&queue->vk, "Sparse binding not supported");
1390 }
1391
1392 device->using_sparse = true;
1393
1394 assert(submit->command_buffer_count == 0);
1395
1396 if (INTEL_DEBUG(DEBUG_SPARSE)) {
1397 fprintf(stderr, "[sparse submission, buffers:%u opaque_images:%u "
1398 "images:%u waits:%u signals:%u]\n",
1399 submit->buffer_bind_count,
1400 submit->image_opaque_bind_count,
1401 submit->image_bind_count,
1402 submit->wait_count, submit->signal_count);
1403 }
1404
1405 struct anv_sparse_submission sparse_submit = {
1406 .queue = queue,
1407 .binds = NULL,
1408 .binds_len = 0,
1409 .binds_capacity = 0,
1410 .wait_count = submit->wait_count,
1411 .signal_count = submit->signal_count,
1412 .waits = submit->waits,
1413 .signals = submit->signals,
1414 };
1415
1416 for (uint32_t i = 0; i < submit->buffer_bind_count; i++) {
1417 VkSparseBufferMemoryBindInfo *bind_info = &submit->buffer_binds[i];
1418 ANV_FROM_HANDLE(anv_buffer, buffer, bind_info->buffer);
1419
1420 assert(anv_buffer_is_sparse(buffer));
1421
1422 for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1423 result = anv_sparse_bind_buffer(device, buffer,
1424 &bind_info->pBinds[j],
1425 &sparse_submit);
1426 if (result != VK_SUCCESS)
1427 goto out_free_submit;
1428 }
1429 }
1430
1431 for (uint32_t i = 0; i < submit->image_bind_count; i++) {
1432 VkSparseImageMemoryBindInfo *bind_info = &submit->image_binds[i];
1433 ANV_FROM_HANDLE(anv_image, image, bind_info->image);
1434
1435 assert(anv_image_is_sparse(image));
1436 assert(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT);
1437
1438 for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1439 result = anv_sparse_bind_image_memory(queue, image,
1440 &bind_info->pBinds[j],
1441 &sparse_submit);
1442 if (result != VK_SUCCESS)
1443 goto out_free_submit;
1444 }
1445 }
1446
1447 for (uint32_t i = 0; i < submit->image_opaque_bind_count; i++) {
1448 VkSparseImageOpaqueMemoryBindInfo *bind_info =
1449 &submit->image_opaque_binds[i];
1450 ANV_FROM_HANDLE(anv_image, image, bind_info->image);
1451
1452 assert(anv_image_is_sparse(image));
1453
1454 for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1455 result = anv_sparse_bind_image_opaque(device, image,
1456 &bind_info->pBinds[j],
1457 &sparse_submit);
1458 if (result != VK_SUCCESS)
1459 goto out_free_submit;
1460 }
1461 }
1462
1463 result = anv_sparse_bind(device, &sparse_submit);
1464
1465 out_free_submit:
1466 vk_free(&device->vk.alloc, sparse_submit.binds);
1467 return result;
1468 }
1469
1470 static VkResult
1471 anv_queue_submit_cmd_buffers_locked(struct anv_queue *queue,
1472 struct vk_queue_submit *submit,
1473 struct anv_utrace_submit *utrace_submit)
1474 {
1475 VkResult result;
1476
1477 if (submit->command_buffer_count == 0) {
1478 result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits,
1479 0 /* cmd_buffer_count */,
1480 NULL /* cmd_buffers */,
1481 submit->signal_count, submit->signals,
1482 NULL /* perf_query_pool */,
1483 0 /* perf_query_pass */,
1484 utrace_submit);
1485 if (result != VK_SUCCESS)
1486 return result;
1487 } else {
1488 /* Everything's easier if we don't have to bother with container_of() */
1489 STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0);
1490 struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers;
1491 struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers;
1492 uint32_t start = 0;
1493 uint32_t end = submit->command_buffer_count;
1494 struct anv_query_pool *perf_query_pool =
1495 cmd_buffers[start]->perf_query_pool;
1496 for (uint32_t n = 0; n < end; n++) {
1497 bool can_chain = false;
1498 uint32_t next = n + 1;
1499 /* Can we chain the last buffer into the next one? */
1500 if (next < end &&
1501 anv_cmd_buffer_is_chainable(cmd_buffers[n]) &&
1502 anv_cmd_buffer_is_chainable(cmd_buffers[next]) &&
1503 can_chain_query_pools
1504 (cmd_buffers[next]->perf_query_pool, perf_query_pool)) {
1505 can_chain = true;
1506 perf_query_pool =
1507 perf_query_pool ? perf_query_pool :
1508 cmd_buffers[next]->perf_query_pool;
1509 }
1510 if (!can_chain) {
1511 /* The next buffer cannot be chained, or we have reached the
1512 * last buffer, so submit what has been chained so far.
1513 */
1514 VkResult result =
1515 anv_queue_exec_locked(queue,
1516 start == 0 ? submit->wait_count : 0,
1517 start == 0 ? submit->waits : NULL,
1518 next - start, &cmd_buffers[start],
1519 next == end ? submit->signal_count : 0,
1520 next == end ? submit->signals : NULL,
1521 perf_query_pool,
1522 submit->perf_pass_index,
1523 next == end ? utrace_submit : NULL);
1524 if (result != VK_SUCCESS)
1525 return result;
1526 if (next < end) {
1527 start = next;
1528 perf_query_pool = cmd_buffers[start]->perf_query_pool;
1529 }
1530 }
1531 }
1532 }
1533 for (uint32_t i = 0; i < submit->signal_count; i++) {
1534 if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync))
1535 continue;
1536
1537 struct anv_bo_sync *bo_sync =
1538 container_of(submit->signals[i].sync, struct anv_bo_sync, sync);
1539
1540 /* Once the execbuf has returned, we need to set the fence state to
1541 * SUBMITTED. We can't do this before calling execbuf because
1542 * anv_GetFenceStatus does take the global device lock before checking
1543 * fence->state.
1544 *
1545 * We set the fence state to SUBMITTED regardless of whether or not the
1546 * execbuf succeeds because we need to ensure that vkWaitForFences() and
1547 * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
1548 * VK_SUCCESS) in a finite amount of time even if execbuf fails.
1549 */
1550 assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET);
1551 bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED;
1552 }
1553
1554 pthread_cond_broadcast(&queue->device->queue_submit);
1555
1556 return VK_SUCCESS;
1557 }
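
/* Worked example (illustrative) of the grouping loop above: for a submit of
 * five command buffers A B C D E where C is not chainable (e.g. it was
 * recorded with SIMULTANEOUS_USE), the queue issues three executions:
 *
 *    exec 1: A B   (waits attached, no signals)
 *    exec 2: C     (no waits, no signals)
 *    exec 3: D E   (signals and utrace_submit attached)
 *
 * Waits are only attached to the first execution and signals to the last
 * one, matching the start == 0 and next == end checks in the loop.
 */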
1558
1559 VkResult
1560 anv_queue_submit(struct vk_queue *vk_queue,
1561 struct vk_queue_submit *submit)
1562 {
1563 struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk);
1564 struct anv_device *device = queue->device;
1565 VkResult result;
1566
1567 if (queue->device->info->no_hw) {
1568 for (uint32_t i = 0; i < submit->signal_count; i++) {
1569 result = vk_sync_signal(&device->vk,
1570 submit->signals[i].sync,
1571 submit->signals[i].signal_value);
1572 if (result != VK_SUCCESS)
1573 return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed");
1574 }
1575 return VK_SUCCESS;
1576 }
1577
1578 /* Flush the trace points first before taking the lock as the flushing
1579 * might try to take that same lock.
1580 */
1581 struct anv_utrace_submit *utrace_submit = NULL;
1582 result = anv_device_utrace_flush_cmd_buffers(
1583 queue,
1584 submit->command_buffer_count,
1585 (struct anv_cmd_buffer **)submit->command_buffers,
1586 &utrace_submit);
1587 if (result != VK_SUCCESS)
1588 return result;
1589
1590 pthread_mutex_lock(&device->mutex);
1591
1592 uint64_t start_ts = intel_ds_begin_submit(&queue->ds);
1593
1594 if (submit->buffer_bind_count ||
1595 submit->image_opaque_bind_count ||
1596 submit->image_bind_count) {
1597 result = anv_queue_submit_sparse_bind_locked(queue, submit);
1598 } else {
1599 result = anv_queue_submit_cmd_buffers_locked(queue, submit,
1600 utrace_submit);
1601 }
1602
1603 /* Take submission ID under lock */
1604 intel_ds_end_submit(&queue->ds, start_ts);
1605
1606 pthread_mutex_unlock(&device->mutex);
1607
1608 intel_ds_device_process(&device->ds, true);
1609
1610 return result;
1611 }
1612
1613 VkResult
1614 anv_queue_submit_simple_batch(struct anv_queue *queue,
1615 struct anv_batch *batch,
1616 bool is_companion_rcs_batch)
1617 {
1618 struct anv_device *device = queue->device;
1619 VkResult result = VK_SUCCESS;
1620
1621 if (anv_batch_has_error(batch))
1622 return batch->status;
1623
1624 if (queue->device->info->no_hw)
1625 return VK_SUCCESS;
1626
1627 /* This is only used by device init so we can assume the queue is empty and
1628 * we aren't fighting with a submit thread.
1629 */
1630 assert(vk_queue_is_empty(&queue->vk));
1631
1632 uint32_t batch_size = align(batch->next - batch->start, 8);
1633
1634 struct anv_bo *batch_bo = NULL;
1635 result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo);
1636 if (result != VK_SUCCESS)
1637 return result;
1638
1639 memcpy(batch_bo->map, batch->start, batch_size);
1640 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
1641 if (device->physical->memory.need_flush &&
1642 anv_bo_needs_host_cache_flush(batch_bo->alloc_flags))
1643 intel_flush_range(batch_bo->map, batch_size);
1644 #endif
1645
1646 if (INTEL_DEBUG(DEBUG_BATCH) &&
1647 intel_debug_batch_in_range(device->debug_frame_desc->frame_id)) {
1648 int render_queue_idx =
1649 anv_get_first_render_queue_index(device->physical);
1650 struct intel_batch_decode_ctx *ctx = is_companion_rcs_batch ?
1651 &device->decoder[render_queue_idx] :
1652 queue->decoder;
1653 intel_print_batch(ctx, batch_bo->map, batch_bo->size, batch_bo->offset,
1654 false);
1655 }
1656
1657 result = device->kmd_backend->execute_simple_batch(queue, batch_bo,
1658 batch_size,
1659 is_companion_rcs_batch);
1660
1661 anv_bo_pool_free(&device->batch_bo_pool, batch_bo);
1662
1663 return result;
1664 }
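
/* Illustrative sketch (not compiled): device initialization typically builds
 * a small batch in temporary storage and runs it synchronously through this
 * helper. The buffer size below is hypothetical:
 *
 *    uint32_t cmds[64];
 *    struct anv_batch batch = {
 *       .start = cmds,
 *       .next  = cmds,
 *       .end   = (void *)cmds + sizeof(cmds),
 *    };
 *    ... emit packets, ending with GFX9_MI_BATCH_BUFFER_END ...
 *    VkResult result =
 *       anv_queue_submit_simple_batch(queue, &batch,
 *                                     false); // not a companion RCS batch
 */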
1665
1666 VkResult
1667 anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
1668 struct anv_batch *batch)
1669 {
1670 struct anv_queue *queue = submit->queue;
1671 struct anv_device *device = queue->device;
1672 VkResult result = VK_SUCCESS;
1673
1674 uint32_t batch_size = align(batch->next - batch->start, 8);
1675 struct anv_trtt_batch_bo *trtt_bbo;
1676 result = anv_trtt_batch_bo_new(device, batch_size, &trtt_bbo);
1677 if (result != VK_SUCCESS)
1678 return result;
1679
1680 memcpy(trtt_bbo->bo->map, batch->start, trtt_bbo->size);
1681 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
1682 if (device->physical->memory.need_flush &&
1683 anv_bo_needs_host_cache_flush(trtt_bbo->bo->alloc_flags))
1684 intel_flush_range(trtt_bbo->bo->map, trtt_bbo->size);
1685 #endif
1686
1687 if (INTEL_DEBUG(DEBUG_BATCH)) {
1688 intel_print_batch(queue->decoder, trtt_bbo->bo->map, trtt_bbo->bo->size,
1689 trtt_bbo->bo->offset, false);
1690 }
1691
1692 result = device->kmd_backend->execute_trtt_batch(submit, trtt_bbo);
1693
1694 return result;
1695 }
1696
1697 void
1698 anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
1699 uint32_t num_cmd_buffers)
1700 {
1701 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
1702 struct anv_batch_bo **bbo;
1703
1704 __builtin_ia32_mfence();
1705
1706 for (uint32_t i = 0; i < num_cmd_buffers; i++) {
1707 u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
1708 intel_flush_range_no_fence((*bbo)->bo->map, (*bbo)->length);
1709 }
1710 }
1711
1712 __builtin_ia32_mfence();
1713 #endif
1714 }
1715