1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 
30 #include <xf86drm.h>
31 
32 #include "anv_private.h"
33 #include "anv_measure.h"
34 
35 #include "genxml/gen9_pack.h"
36 #include "genxml/genX_bits.h"
37 
38 #include "util/perf/u_trace.h"
39 
40 /** \file anv_batch_chain.c
41  *
42  * This file contains functions related to anv_cmd_buffer as a data
43  * structure.  This involves everything required to create and destroy
44  * the actual batch buffers as well as link them together.
45  *
46  * It specifically does *not* contain any handling of actual vkCmd calls
47  * beyond vkCmdExecuteCommands.
48  */
49 
50 /*-----------------------------------------------------------------------*
51  * Functions related to anv_reloc_list
52  *-----------------------------------------------------------------------*/
53 
54 VkResult
55 anv_reloc_list_init(struct anv_reloc_list *list,
56                     const VkAllocationCallbacks *alloc,
57                     bool uses_relocs)
58 {
59    assert(alloc != NULL);
60    memset(list, 0, sizeof(*list));
61    list->uses_relocs = uses_relocs;
62    list->alloc = alloc;
63    return VK_SUCCESS;
64 }
65 
66 static VkResult
67 anv_reloc_list_init_clone(struct anv_reloc_list *list,
68                           const struct anv_reloc_list *other_list)
69 {
70    list->dep_words = other_list->dep_words;
71 
72    if (list->dep_words > 0) {
73       list->deps =
74          vk_alloc(list->alloc, list->dep_words * sizeof(BITSET_WORD), 8,
75                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (list->deps == NULL)
         return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
76       memcpy(list->deps, other_list->deps,
77              list->dep_words * sizeof(BITSET_WORD));
78    } else {
79       list->deps = NULL;
80    }
81 
82    return VK_SUCCESS;
83 }
84 
85 void
86 anv_reloc_list_finish(struct anv_reloc_list *list)
87 {
88    vk_free(list->alloc, list->deps);
89 }
90 
91 static VkResult
92 anv_reloc_list_grow_deps(struct anv_reloc_list *list,
93                          uint32_t min_num_words)
94 {
95    if (min_num_words <= list->dep_words)
96       return VK_SUCCESS;
97 
98    uint32_t new_length = MAX2(32, list->dep_words * 2);
99    while (new_length < min_num_words)
100       new_length *= 2;
101 
102    BITSET_WORD *new_deps =
103       vk_realloc(list->alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,
104                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
105    if (new_deps == NULL)
106       return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
107    list->deps = new_deps;
108 
109    /* Zero out the new data */
110    memset(list->deps + list->dep_words, 0,
111           (new_length - list->dep_words) * sizeof(BITSET_WORD));
112    list->dep_words = new_length;
113 
114    return VK_SUCCESS;
115 }
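
/* Worked example (editorial illustration, not part of the driver): with
 * dep_words == 0 and min_num_words == 70, new_length starts at
 * MAX2(32, 0) == 32, then doubles to 64 and 128 before the loop exits, so
 * the list ends up with 128 BITSET_WORDs of dependency bits, the newly
 * added tail of which is zeroed above.
 */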
116 
117 VkResult
118 anv_reloc_list_add_bo_impl(struct anv_reloc_list *list,
119                            struct anv_bo *target_bo)
120 {
121    /* This can happen with sparse resources. */
122    if (!target_bo)
123       return VK_SUCCESS;
124 
125    uint32_t idx = target_bo->gem_handle;
126    VkResult result = anv_reloc_list_grow_deps(list,
127                                               (idx / BITSET_WORDBITS) + 1);
128    if (unlikely(result != VK_SUCCESS))
129       return result;
130 
131    BITSET_SET(list->deps, idx);
132 
133    return VK_SUCCESS;
134 }
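
/* Usage sketch (editorial illustration; the caller is hypothetical): code
 * emitting a command that references a BO records the dependency so that
 * submission can later build its unique BO list:
 *
 *    VkResult result = anv_reloc_list_add_bo_impl(batch->relocs, bo);
 *    if (result != VK_SUCCESS)
 *       return anv_batch_set_error(batch, result);
 *
 * The bit index is the GEM handle itself, so the bitset is sized by the
 * largest handle seen rather than by the number of dependencies.
 */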
135 
136 static void
137 anv_reloc_list_clear(struct anv_reloc_list *list)
138 {
139    if (list->dep_words > 0)
140       memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
141 }
142 
143 VkResult
144 anv_reloc_list_append(struct anv_reloc_list *list,
145                       struct anv_reloc_list *other)
146 {
147    anv_reloc_list_grow_deps(list, other->dep_words);
148    for (uint32_t w = 0; w < other->dep_words; w++)
149       list->deps[w] |= other->deps[w];
150 
151    return VK_SUCCESS;
152 }
153 
154 /*-----------------------------------------------------------------------*
155  * Functions related to anv_batch
156  *-----------------------------------------------------------------------*/
157 
158 static VkResult
159 anv_extend_batch(struct anv_batch *batch, uint32_t size)
160 {
161    assert(batch->extend_cb != NULL);
162    VkResult result = batch->extend_cb(batch, size, batch->user_data);
163    if (result != VK_SUCCESS)
164       return anv_batch_set_error(batch, result);
165    return result;
166 }
167 
168 void *
169 anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
170 {
171    uint32_t size = num_dwords * 4;
172    if (batch->next + size > batch->end) {
173       if (anv_extend_batch(batch, size) != VK_SUCCESS)
174          return NULL;
175    }
176 
177    void *p = batch->next;
178 
179    batch->next += num_dwords * 4;
180    assert(batch->next <= batch->end);
181 
182    return p;
183 }
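
/* Usage sketch (editorial illustration, hypothetical payload): callers
 * reserve a fixed number of dwords, check for NULL (anv_extend_batch has
 * already flagged the batch with an error on failure), and then fill in
 * the reserved space:
 *
 *    uint32_t *dw = anv_batch_emit_dwords(batch, 2);
 *    if (dw == NULL)
 *       return;
 *    dw[0] = header_dword;
 *    dw[1] = payload_dword;
 */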
184 
185 /* Ensure enough contiguous space is available */
186 VkResult
187 anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size)
188 {
189    if (batch->next + size > batch->end) {
190       VkResult result = anv_extend_batch(batch, size);
191       if (result != VK_SUCCESS)
192          return result;
193    }
194 
195    assert(batch->next + size <= batch->end);
196 
197    return VK_SUCCESS;
198 }
199 
200 void
201 anv_batch_advance(struct anv_batch *batch, uint32_t size)
202 {
203    assert(batch->next + size <= batch->end);
204 
205    batch->next += size;
206 }
207 
208 struct anv_address
209 anv_batch_address(struct anv_batch *batch, void *batch_location)
210 {
211    assert(batch->start <= batch_location);
212 
213    /* Allow a jump at the current location of the batch. */
214    assert(batch->next >= batch_location);
215 
216    return anv_address_add(batch->start_addr, batch_location - batch->start);
217 }
218 
219 void
220 anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
221 {
222    uint32_t size = other->next - other->start;
223    assert(size % 4 == 0);
224 
225    if (batch->next + size > batch->end) {
226       if (anv_extend_batch(batch, size) != VK_SUCCESS)
227          return;
228    }
229 
230    assert(batch->next + size <= batch->end);
231 
232    VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
233    memcpy(batch->next, other->start, size);
234 
235    VkResult result = anv_reloc_list_append(batch->relocs, other->relocs);
236    if (result != VK_SUCCESS) {
237       anv_batch_set_error(batch, result);
238       return;
239    }
240 
241    batch->next += size;
242 }
243 
244 /*-----------------------------------------------------------------------*
245  * Functions related to anv_batch_bo
246  *-----------------------------------------------------------------------*/
247 
248 static VkResult
249 anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
250                     uint32_t size,
251                     struct anv_batch_bo **bbo_out)
252 {
253    VkResult result;
254 
255    struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
256                                         8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
257    if (bbo == NULL)
258       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
259 
260    result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
261                               size, &bbo->bo);
262    if (result != VK_SUCCESS)
263       goto fail_alloc;
264 
265    const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
266    result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc, uses_relocs);
267    if (result != VK_SUCCESS)
268       goto fail_bo_alloc;
269 
270    *bbo_out = bbo;
271 
272    return VK_SUCCESS;
273 
274  fail_bo_alloc:
275    anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
276  fail_alloc:
277    vk_free(&cmd_buffer->vk.pool->alloc, bbo);
278 
279    return result;
280 }
281 
282 static VkResult
283 anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
284                    const struct anv_batch_bo *other_bbo,
285                    struct anv_batch_bo **bbo_out)
286 {
287    VkResult result;
288 
289    struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
290                                         8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
291    if (bbo == NULL)
292       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
293 
294    result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
295                               other_bbo->bo->size, &bbo->bo);
296    if (result != VK_SUCCESS)
297       goto fail_alloc;
298 
299    result = anv_reloc_list_init_clone(&bbo->relocs, &other_bbo->relocs);
300    if (result != VK_SUCCESS)
301       goto fail_bo_alloc;
302 
303    bbo->length = other_bbo->length;
304    memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length);
305    *bbo_out = bbo;
306 
307    return VK_SUCCESS;
308 
309  fail_bo_alloc:
310    anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
311  fail_alloc:
312    vk_free(&cmd_buffer->vk.pool->alloc, bbo);
313 
314    return result;
315 }
316 
317 static void
318 anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
319                    size_t batch_padding)
320 {
321    anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, },
322                          bbo->bo->map, bbo->bo->size - batch_padding);
323    batch->relocs = &bbo->relocs;
324    anv_reloc_list_clear(&bbo->relocs);
325 }
326 
327 static void
328 anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch,
329                       size_t batch_padding)
330 {
331    batch->start_addr = (struct anv_address) { .bo = bbo->bo, };
332    batch->start = bbo->bo->map;
333    batch->next = bbo->bo->map + bbo->length;
334    batch->end = bbo->bo->map + bbo->bo->size - batch_padding;
335    batch->relocs = &bbo->relocs;
336 }
337 
338 static void
339 anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
340 {
341    assert(batch->start == bbo->bo->map);
342    bbo->length = batch->next - batch->start;
343    VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
344 }
345 
346 static void
347 anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
348                   struct anv_batch_bo *prev_bbo,
349                   struct anv_batch_bo *next_bbo,
350                   uint32_t next_bbo_offset)
351 {
352    const uint32_t bb_start_offset =
353       prev_bbo->length - GFX9_MI_BATCH_BUFFER_START_length * 4;
354    ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset;
355 
356    /* Make sure we're looking at a MI_BATCH_BUFFER_START */
357    assert(((*bb_start >> 29) & 0x07) == 0);
358    assert(((*bb_start >> 23) & 0x3f) == 49);
359 
360    uint64_t *map = prev_bbo->bo->map + bb_start_offset + 4;
361    *map = intel_canonical_address(next_bbo->bo->offset + next_bbo_offset);
362 
363 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
364    if (cmd_buffer->device->physical->memory.need_flush &&
365        anv_bo_needs_host_cache_flush(prev_bbo->bo->alloc_flags))
366       intel_flush_range(map, sizeof(uint64_t));
367 #endif
368 }
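
/* Editorial note: the asserts above decode the packet header in place:
 * bits 31:29 are the command type (0 == MI) and bits 28:23 are the MI
 * opcode (49 == 0x31 == MI_BATCH_BUFFER_START).  The 64-bit address that
 * follows the header dword is then overwritten, which is how an
 * already-recorded batch BO gets re-pointed at a new successor without
 * re-emitting any of its contents.
 */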
369 
370 static void
371 anv_batch_bo_destroy(struct anv_batch_bo *bbo,
372                      struct anv_cmd_buffer *cmd_buffer)
373 {
374    anv_reloc_list_finish(&bbo->relocs);
375    anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
376    vk_free(&cmd_buffer->vk.pool->alloc, bbo);
377 }
378 
379 static VkResult
380 anv_batch_bo_list_clone(const struct list_head *list,
381                         struct anv_cmd_buffer *cmd_buffer,
382                         struct list_head *new_list)
383 {
384    VkResult result = VK_SUCCESS;
385 
386    list_inithead(new_list);
387 
388    struct anv_batch_bo *prev_bbo = NULL;
389    list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
390       struct anv_batch_bo *new_bbo = NULL;
391       result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);
392       if (result != VK_SUCCESS)
393          break;
394       list_addtail(&new_bbo->link, new_list);
395 
396       if (prev_bbo)
397          anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0);
398 
399       prev_bbo = new_bbo;
400    }
401 
402    if (result != VK_SUCCESS) {
403       list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) {
404          list_del(&bbo->link);
405          anv_batch_bo_destroy(bbo, cmd_buffer);
406       }
407    }
408 
409    return result;
410 }
411 
412 /*-----------------------------------------------------------------------*
413  * Functions related to anv_cmd_buffer
414  *-----------------------------------------------------------------------*/
415 
416 static struct anv_batch_bo *
417 anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
418 {
419    return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link);
420 }
421 
422 static struct anv_batch_bo *
423 anv_cmd_buffer_current_generation_batch_bo(struct anv_cmd_buffer *cmd_buffer)
424 {
425    return list_entry(cmd_buffer->generation.batch_bos.prev, struct anv_batch_bo, link);
426 }
427 
428 struct anv_address
429 anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
430 {
431    /* Only graphics & compute queues need binding tables. */
432    if (!(cmd_buffer->queue_family->queueFlags & (VK_QUEUE_GRAPHICS_BIT |
433                                                  VK_QUEUE_COMPUTE_BIT)))
434       return ANV_NULL_ADDRESS;
435 
436    /* If we've never allocated a binding table block, do it now. Otherwise we
437     * would trigger another STATE_BASE_ADDRESS emission which would require an
438     * additional bunch of flushes/stalls.
439     */
440    if (u_vector_length(&cmd_buffer->bt_block_states) == 0) {
441       VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
442       if (result != VK_SUCCESS) {
443          anv_batch_set_error(&cmd_buffer->batch, result);
444          return ANV_NULL_ADDRESS;
445       }
446    }
447 
448    struct anv_state_pool *pool = &cmd_buffer->device->binding_table_pool;
449    struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
450    return (struct anv_address) {
451       .bo = pool->block_pool.bo,
452       .offset = bt_block->offset - pool->start_offset,
453    };
454 }
455 
456 static void
457 emit_batch_buffer_start(struct anv_batch *batch,
458                         struct anv_bo *bo, uint32_t offset)
459 {
460    anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
461       bbs.DWordLength               = GFX9_MI_BATCH_BUFFER_START_length -
462                                       GFX9_MI_BATCH_BUFFER_START_length_bias;
463       bbs.SecondLevelBatchBuffer    = Firstlevelbatch;
464       bbs.AddressSpaceIndicator     = ASI_PPGTT;
465       bbs.BatchBufferStartAddress   = (struct anv_address) { bo, offset };
466    }
467 }
468 
469 enum anv_cmd_buffer_batch {
470    ANV_CMD_BUFFER_BATCH_MAIN,
471    ANV_CMD_BUFFER_BATCH_GENERATION,
472 };
473 
474 static void
475 cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
476                              struct anv_batch_bo *bbo,
477                              enum anv_cmd_buffer_batch batch_type)
478 {
479    struct anv_batch *batch =
480       batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
481       &cmd_buffer->generation.batch : &cmd_buffer->batch;
482    struct anv_batch_bo *current_bbo =
483       batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
484       anv_cmd_buffer_current_generation_batch_bo(cmd_buffer) :
485       anv_cmd_buffer_current_batch_bo(cmd_buffer);
486 
487    /* We set the end of the batch a little short so we would be sure we
488     * have room for the chaining command.  Since we're about to emit the
489     * chaining command, let's set it back where it should go.
490     */
491    batch->end += GFX9_MI_BATCH_BUFFER_START_length * 4;
492    assert(batch->end == current_bbo->bo->map + current_bbo->bo->size);
493 
494    emit_batch_buffer_start(batch, bbo->bo, 0);
495 
496    anv_batch_bo_finish(current_bbo, batch);
497 
498    /* Add the current amount of data written in the current_bbo to the command
499     * buffer.
500     */
501    cmd_buffer->total_batch_size += current_bbo->length;
502 }
503 
504 static void
505 anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
506                                    struct anv_cmd_buffer *cmd_buffer_to)
507 {
508    uint32_t *bb_start = cmd_buffer_from->batch_end;
509 
510    struct anv_batch_bo *last_bbo =
511       list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link);
512    struct anv_batch_bo *first_bbo =
513       list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
514 
515    struct GFX9_MI_BATCH_BUFFER_START gen_bb_start = {
516       __anv_cmd_header(GFX9_MI_BATCH_BUFFER_START),
517       .SecondLevelBatchBuffer    = Firstlevelbatch,
518       .AddressSpaceIndicator     = ASI_PPGTT,
519       .BatchBufferStartAddress   = (struct anv_address) { first_bbo->bo, 0 },
520    };
521    struct anv_batch local_batch = {
522       .start  = last_bbo->bo->map,
523       .end    = last_bbo->bo->map + last_bbo->bo->size,
524       .relocs = &last_bbo->relocs,
525       .alloc  = &cmd_buffer_from->vk.pool->alloc,
526    };
527 
528    __anv_cmd_pack(GFX9_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
529 
530    last_bbo->chained = true;
531 }
532 
533 static void
534 anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
535 {
536    struct anv_batch_bo *last_bbo =
537       list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
538    last_bbo->chained = false;
539 
540    uint32_t *batch = cmd_buffer->batch_end;
541    anv_pack_struct(batch, GFX9_MI_BATCH_BUFFER_END,
542                    __anv_cmd_header(GFX9_MI_BATCH_BUFFER_END));
543 }
544 
545 static VkResult
546 anv_cmd_buffer_chain_batch(struct anv_batch *batch, uint32_t size, void *_data)
547 {
548    /* The caller should not need that much space. Otherwise it should split
549     * its commands.
550     */
551    assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
552 
553    struct anv_cmd_buffer *cmd_buffer = _data;
554    struct anv_batch_bo *new_bbo = NULL;
555    /* Amount of reserved space at the end of the batch to account for the
556     * chaining instruction.
557     */
558    const uint32_t batch_padding = GFX9_MI_BATCH_BUFFER_START_length * 4;
559    /* Cap reallocation to chunk. */
560    uint32_t alloc_size = MIN2(
561       MAX2(batch->allocated_batch_size, size + batch_padding),
562       ANV_MAX_CMD_BUFFER_BATCH_SIZE);
563 
564    VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
565    if (result != VK_SUCCESS)
566       return result;
567 
568    batch->allocated_batch_size += alloc_size;
569 
570    struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
571    if (seen_bbo == NULL) {
572       anv_batch_bo_destroy(new_bbo, cmd_buffer);
573       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
574    }
575    *seen_bbo = new_bbo;
576 
577    cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo, ANV_CMD_BUFFER_BATCH_MAIN);
578 
579    list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);
580 
581    anv_batch_bo_start(new_bbo, batch, batch_padding);
582 
583    return VK_SUCCESS;
584 }
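
/* Growth sketch (editorial illustration): alloc_size is at least the total
 * already allocated, so each extension roughly doubles the command buffer
 * until the ANV_MAX_CMD_BUFFER_BATCH_SIZE cap is reached.  Starting from
 * ANV_MIN_CMD_BUFFER_BATCH_SIZE, the chained BO sizes look like
 *
 *    min, min, 2*min, 4*min, ..., ANV_MAX_CMD_BUFFER_BATCH_SIZE, ...
 *
 * so a long batch needs O(log n) growing BOs plus a tail of max-size ones.
 */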
585 
586 static VkResult
587 anv_cmd_buffer_chain_generation_batch(struct anv_batch *batch, uint32_t size, void *_data)
588 {
589    /* The caller should not need that much space. Otherwise it should split
590     * its commands.
591     */
592    assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
593 
594    struct anv_cmd_buffer *cmd_buffer = _data;
595    struct anv_batch_bo *new_bbo = NULL;
596    /* Cap reallocation to chunk. */
597    uint32_t alloc_size = MIN2(
598       MAX2(batch->allocated_batch_size, size),
599       ANV_MAX_CMD_BUFFER_BATCH_SIZE);
600 
601    VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
602    if (result != VK_SUCCESS)
603       return result;
604 
605    batch->allocated_batch_size += alloc_size;
606 
607    struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
608    if (seen_bbo == NULL) {
609       anv_batch_bo_destroy(new_bbo, cmd_buffer);
610       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
611    }
612    *seen_bbo = new_bbo;
613 
614    if (!list_is_empty(&cmd_buffer->generation.batch_bos)) {
615       cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo,
616                                    ANV_CMD_BUFFER_BATCH_GENERATION);
617    }
618 
619    list_addtail(&new_bbo->link, &cmd_buffer->generation.batch_bos);
620 
621    anv_batch_bo_start(new_bbo, batch, GFX9_MI_BATCH_BUFFER_START_length * 4);
622 
623    return VK_SUCCESS;
624 }
625 
626 /** Allocate a binding table
627  *
628  * This function allocates a binding table.  This is a bit more complicated
629  * than one would think due to a combination of Vulkan driver design and some
630  * unfortunate hardware restrictions.
631  *
632  * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for
633  * the binding table pointer which means that all binding tables need to live
634  * in the bottom 64k of surface state base address.  The way the GL driver has
635  * classically dealt with this restriction is to emit all surface states
636  * on-the-fly into the batch and have a batch buffer smaller than 64k.  This
637  * isn't really an option in Vulkan for a couple of reasons:
638  *
639  *  1) In Vulkan, we have growing (or chaining) batches so surface states have
640  *     to live in their own buffer and we have to be able to re-emit
641  *     STATE_BASE_ADDRESS as needed which requires a full pipeline stall.  In
642  *     order to avoid emitting STATE_BASE_ADDRESS any more often than needed
643  *     (it's not that hard to hit 64k of just binding tables), we allocate
644  *     surface state objects up-front when VkImageView is created.  In order
645  *     for this to work, surface state objects need to be allocated from a
646  *     global buffer.
647  *
648  *  2) We tried to design the surface state system in such a way that it's
649  *     already ready for bindless texturing.  The way bindless texturing works
650  *     on our hardware is that you have a big pool of surface state objects
651  *     (with its own state base address) and the bindless handles are simply
652  *     offsets into that pool.  With the architecture we chose, we already
653  *     have that pool and it's exactly the same pool that we use for regular
654  *     surface states so we should already be ready for bindless.
655  *
656  *  3) For render targets, we need to be able to fill out the surface states
657  *     later in vkBeginRenderPass so that we can assign clear colors
658  *     correctly.  One way to do this would be to just create the surface
659  *     state data and then repeatedly copy it into the surface state BO every
660  *     time we have to re-emit STATE_BASE_ADDRESS.  While this works, it's
661  *     rather annoying, and it's much simpler to just allocate them up-front
662  *     and re-use them for the entire render pass.
663  *
664  * While none of these are technically blockers for emitting state on the fly
665  * like we do in GL, the ability to have a single surface state pool
666  * simplifies things greatly.  Unfortunately, it comes at a cost...
667  *
668  * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't
669  * place the binding tables just anywhere in surface state base address.
670  * Because 64k isn't a whole lot of space, we can't simply restrict the
671  * surface state buffer to 64k; we have to be more clever.  The solution we've
672  * chosen is to have a block pool with a maximum size of 2G that starts at
673  * zero and grows in both directions.  All surface states are allocated from
674  * the top of the pool (positive offsets) and we allocate blocks (< 64k) of
675  * binding tables from the bottom of the pool (negative offsets).  Every time
676  * we allocate a new binding table block, we set surface state base address to
677  * point to the bottom of the binding table block.  This way all of the
678  * binding tables in the block are in the bottom 64k of surface state base
679  * address.  When we fill out the binding table, we add the distance between
680  * the bottom of our binding table block and zero of the block pool to the
681  * surface state offsets so that they are correct relative to our new surface
682  * state base address at the bottom of the binding table block.
683  *
684  * \param[in]  entries        The number of surface state entries the binding
685  *                            table should be able to hold.
686  *
687  * \param[out] state_offset   The offset from surface state base address
688  *                            where the surface states live.  This must be
689  *                            added to the surface state offset when it is
690  *                            written into the binding table entry.
691  *
692  * \return                    An anv_state representing the binding table
693  */
694 struct anv_state
695 anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
696                                    uint32_t entries, uint32_t *state_offset)
697 {
698    if (u_vector_length(&cmd_buffer->bt_block_states) == 0)
699       return (struct anv_state) { 0 };
700 
701    struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
702 
703    uint32_t bt_size = align(entries * 4, 32);
704 
705    struct anv_state state = cmd_buffer->bt_next;
706    if (bt_size > state.alloc_size)
707       return (struct anv_state) { 0 };
708 
709    state.alloc_size = bt_size;
710    cmd_buffer->bt_next.offset += bt_size;
711    cmd_buffer->bt_next.map += bt_size;
712    cmd_buffer->bt_next.alloc_size -= bt_size;
713 
714    if (cmd_buffer->device->info->verx10 >= 125) {
715       /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding
716        * table address independently from surface state base address.  We no
717        * longer need any sort of offsetting.
718        */
719       *state_offset = 0;
720    } else {
721       assert(bt_block->offset < 0);
722       *state_offset = -bt_block->offset;
723    }
724 
725    return state;
726 }
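
/* Worked example (editorial, hypothetical numbers): on hardware below
 * verx10 125, suppose the current binding table block sits at
 * bt_block->offset == -4096 (binding table blocks live at negative offsets
 * in the block pool) and a surface state sits at +8192 from the pool's zero
 * point.  Then *state_offset == 4096 and the value written into the binding
 * table entry is 8192 + 4096 == 12288, i.e. the surface state's distance
 * from the surface state base address programmed at the bottom of the
 * binding table block.
 */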
727 
728 struct anv_state
729 anv_cmd_buffer_alloc_surface_states(struct anv_cmd_buffer *cmd_buffer,
730                                     uint32_t count)
731 {
732    if (count == 0)
733       return ANV_STATE_NULL;
734    struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
735    struct anv_state state =
736       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
737                              count * isl_dev->ss.size,
738                              isl_dev->ss.align);
739    if (state.map == NULL)
740       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
741    return state;
742 }
743 
744 struct anv_state
745 anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
746                                    uint32_t size, uint32_t alignment)
747 {
748    if (size == 0)
749       return ANV_STATE_NULL;
750    struct anv_state state =
751       anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
752                              size, alignment);
753    if (state.map == NULL)
754       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
755    return state;
756 }
757 
758 struct anv_state
759 anv_cmd_buffer_alloc_general_state(struct anv_cmd_buffer *cmd_buffer,
760                                    uint32_t size, uint32_t alignment)
761 {
762    if (size == 0)
763       return ANV_STATE_NULL;
764    struct anv_state state =
765       anv_state_stream_alloc(&cmd_buffer->general_state_stream,
766                              size, alignment);
767    if (state.map == NULL)
768       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
769    return state;
770 }
771 
772 /** Allocate space associated with a command buffer
773  *
774  * Some commands like vkCmdBuildAccelerationStructuresKHR() can end up needing
775  * large amounts of temporary buffers. This function is here to deal with those
776  * potentially larger allocations, using a side BO if needed.
777  *
778  */
779 struct anv_cmd_alloc
780 anv_cmd_buffer_alloc_space(struct anv_cmd_buffer *cmd_buffer,
781                            size_t size, uint32_t alignment,
782                            bool mapped)
783 {
784    /* Below 16k, source memory from dynamic state, otherwise allocate a BO. */
785    if (size < 16 * 1024) {
786       struct anv_state state =
787          anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
788                                 size, alignment);
789       if (state.map == NULL) {
790          anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
791          return (struct anv_cmd_alloc) {
792             .address = ANV_NULL_ADDRESS,
793          };
794       }
795 
796       return (struct anv_cmd_alloc) {
797          .address = anv_state_pool_state_address(
798             &cmd_buffer->device->dynamic_state_pool,
799             state),
800          .map = state.map,
801          .size = size,
802       };
803    }
804 
805    assert(alignment <= 4096);
806 
807    struct anv_bo *bo = NULL;
808    VkResult result =
809       anv_bo_pool_alloc(mapped ?
810                         &cmd_buffer->device->batch_bo_pool :
811                         &cmd_buffer->device->bvh_bo_pool,
812                         align(size, 4096), &bo);
813    if (result != VK_SUCCESS) {
814       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
815       return ANV_EMPTY_ALLOC;
816    }
817 
818    struct anv_bo **bo_entry =
819       u_vector_add(&cmd_buffer->dynamic_bos);
820    if (bo_entry == NULL) {
821       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
822       anv_bo_pool_free(bo->map != NULL ?
823                        &cmd_buffer->device->batch_bo_pool :
824                        &cmd_buffer->device->bvh_bo_pool, bo);
825       return ANV_EMPTY_ALLOC;
826    }
827    *bo_entry = bo;
828 
829    return (struct anv_cmd_alloc) {
830       .address = (struct anv_address) { .bo = bo },
831       .map = bo->map,
832       .size = size,
833    };
834 }
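
/* Usage sketch (editorial illustration, hypothetical sizes): a small scratch
 * buffer comes out of the dynamic state stream, while a large temporary
 * buffer gets its own BO (from the batch BO pool when a CPU mapping is
 * requested, otherwise from the BVH BO pool):
 *
 *    struct anv_cmd_alloc tmp =
 *       anv_cmd_buffer_alloc_space(cmd_buffer, 1024 * 1024, 64, true);
 *    if (tmp.map == NULL)
 *       return;
 *
 * With a mapping requested, a NULL map indicates failure (the error has
 * already been recorded on the batch).  Note that the BO path asserts
 * alignment <= 4096, since pool BOs only guarantee page alignment.
 */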
835 
836 VkResult
837 anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
838 {
839    struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
840    if (bt_block == NULL) {
841       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
842       return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
843    }
844 
845    *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);
846 
847    /* The bt_next state is a rolling state (we update it as we suballocate
848     * from it) which is relative to the start of the binding table block.
849     */
850    cmd_buffer->bt_next = *bt_block;
851    cmd_buffer->bt_next.offset = 0;
852 
853    return VK_SUCCESS;
854 }
855 
856 VkResult
857 anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
858 {
859    struct anv_batch_bo *batch_bo = NULL;
860    VkResult result;
861 
862    list_inithead(&cmd_buffer->batch_bos);
863 
864    cmd_buffer->total_batch_size = 0;
865 
866    result = anv_batch_bo_create(cmd_buffer,
867                                 ANV_MIN_CMD_BUFFER_BATCH_SIZE,
868                                 &batch_bo);
869    if (result != VK_SUCCESS)
870       return result;
871 
872    list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);
873 
874    cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc;
875    cmd_buffer->batch.user_data = cmd_buffer;
876    cmd_buffer->batch.allocated_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;
877 
878    cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
879    cmd_buffer->batch.engine_class = cmd_buffer->queue_family->engine_class;
880 
881    anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
882                       GFX9_MI_BATCH_BUFFER_START_length * 4);
883 
884    /* Generation batch is initialized empty since it's possible it won't be
885     * used.
886     */
887    list_inithead(&cmd_buffer->generation.batch_bos);
888 
889    cmd_buffer->generation.batch.alloc = &cmd_buffer->vk.pool->alloc;
890    cmd_buffer->generation.batch.user_data = cmd_buffer;
891    cmd_buffer->generation.batch.allocated_batch_size = 0;
892    cmd_buffer->generation.batch.extend_cb = anv_cmd_buffer_chain_generation_batch;
893    cmd_buffer->generation.batch.engine_class =
894       cmd_buffer->queue_family->engine_class;
895 
896    int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8,
897                                     sizeof(struct anv_bo *));
898    if (!success)
899       goto fail_batch_bo;
900 
901    *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
902 
903    success = u_vector_init(&cmd_buffer->bt_block_states, 8,
904                            sizeof(struct anv_state));
905    if (!success)
906       goto fail_seen_bbos;
907 
908    const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
909    result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
910                                 &cmd_buffer->vk.pool->alloc, uses_relocs);
911    if (result != VK_SUCCESS)
912       goto fail_bt_blocks;
913 
914    return VK_SUCCESS;
915 
916  fail_bt_blocks:
917    u_vector_finish(&cmd_buffer->bt_block_states);
918  fail_seen_bbos:
919    u_vector_finish(&cmd_buffer->seen_bbos);
920  fail_batch_bo:
921    anv_batch_bo_destroy(batch_bo, cmd_buffer);
922 
923    return result;
924 }
925 
926 void
927 anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
928 {
929    struct anv_state *bt_block;
930    u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)
931       anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
932    u_vector_finish(&cmd_buffer->bt_block_states);
933 
934    anv_reloc_list_finish(&cmd_buffer->surface_relocs);
935 
936    u_vector_finish(&cmd_buffer->seen_bbos);
937 
938    /* Destroy all of the batch buffers */
939    list_for_each_entry_safe(struct anv_batch_bo, bbo,
940                             &cmd_buffer->batch_bos, link) {
941       list_del(&bbo->link);
942       anv_batch_bo_destroy(bbo, cmd_buffer);
943    }
944    /* Also destroy all generation batch buffers */
945    list_for_each_entry_safe(struct anv_batch_bo, bbo,
946                             &cmd_buffer->generation.batch_bos, link) {
947       list_del(&bbo->link);
948       anv_batch_bo_destroy(bbo, cmd_buffer);
949    }
950 
951    if (cmd_buffer->generation.ring_bo) {
952       anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
953                        cmd_buffer->generation.ring_bo);
954    }
955 }
956 
957 void
958 anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
959 {
960    /* Delete all but the first batch bo */
961    assert(!list_is_empty(&cmd_buffer->batch_bos));
962    while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {
963       struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
964       list_del(&bbo->link);
965       anv_batch_bo_destroy(bbo, cmd_buffer);
966    }
967    assert(!list_is_empty(&cmd_buffer->batch_bos));
968 
969    anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
970                       &cmd_buffer->batch,
971                       GFX9_MI_BATCH_BUFFER_START_length * 4);
972 
973    while (u_vector_length(&cmd_buffer->bt_block_states) > 0) {
974       struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
975       anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
976    }
977    cmd_buffer->bt_next = ANV_STATE_NULL;
978 
979    anv_reloc_list_clear(&cmd_buffer->surface_relocs);
980 
981    /* Reset the list of seen buffers */
982    cmd_buffer->seen_bbos.head = 0;
983    cmd_buffer->seen_bbos.tail = 0;
984 
985    struct anv_batch_bo *first_bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
986 
987    *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo;
988 
989    assert(first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);
990    cmd_buffer->batch.allocated_batch_size = first_bbo->bo->size;
991 
992    /* Delete all generation batch bos */
993    list_for_each_entry_safe(struct anv_batch_bo, bbo,
994                             &cmd_buffer->generation.batch_bos, link) {
995       list_del(&bbo->link);
996       anv_batch_bo_destroy(bbo, cmd_buffer);
997    }
998 
999    /* And reset generation batch */
1000    cmd_buffer->generation.batch.allocated_batch_size = 0;
1001    cmd_buffer->generation.batch.start = NULL;
1002    cmd_buffer->generation.batch.end   = NULL;
1003    cmd_buffer->generation.batch.next  = NULL;
1004 
1005    if (cmd_buffer->generation.ring_bo) {
1006       anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
1007                        cmd_buffer->generation.ring_bo);
1008       cmd_buffer->generation.ring_bo = NULL;
1009    }
1010 
1011    cmd_buffer->total_batch_size = 0;
1012 }
1013 
1014 void
1015 anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
1016 {
1017    const struct intel_device_info *devinfo = cmd_buffer->device->info;
1018    struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
1019 
1020    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1021       /* When we start a batch buffer, we subtract a certain amount of
1022        * padding from the end to ensure that we always have room to emit a
1023        * BATCH_BUFFER_START to chain to the next BO.  We need to remove
1024        * that padding before we end the batch; otherwise, we may end up
1025        * with our BATCH_BUFFER_END in another BO.
1026        */
1027       cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
1028       assert(cmd_buffer->batch.start == batch_bo->bo->map);
1029       assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
1030 
1031       /* Save end instruction location to override it later. */
1032       cmd_buffer->batch_end = cmd_buffer->batch.next;
1033 
1034       /* If we can chain this command buffer to another one, leave some place
1035        * for the jump instruction.
1036        */
1037       batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer);
1038       if (batch_bo->chained)
1039          emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
1040       else
1041          anv_batch_emit(&cmd_buffer->batch, GFX9_MI_BATCH_BUFFER_END, bbe);
1042 
1043       /* Round batch up to an even number of dwords. */
1044       if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
1045          anv_batch_emit(&cmd_buffer->batch, GFX9_MI_NOOP, noop);
1046 
1047       cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
1048    } else {
1049       assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1050       /* If this is a secondary command buffer, we need to determine the
1051        * mode in which it will be executed with vkExecuteCommands.  We
1052        * determine this statically here so that this stays in sync with the
1053        * actual ExecuteCommands implementation.
1054        */
1055       const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start;
1056       if (cmd_buffer->device->physical->use_call_secondary) {
1057          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN;
1058 
1059          void *jump_addr =
1060             anv_genX(devinfo, batch_emit_return)(&cmd_buffer->batch) +
1061             (GFX9_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
1062          cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);
1063 
1064          /* The emit above may have caused us to chain batch buffers which
1065           * would mean that batch_bo is no longer valid.
1066           */
1067          batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
1068       } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
1069                  (length < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2)) {
1070          /* If the secondary has exactly one batch buffer in its list *and*
1071           * that batch buffer is less than half of the minimum batch size, we're
1072           * probably better off simply copying it into our batch.
1073           */
1074          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
1075       } else if (!(cmd_buffer->usage_flags &
1076                    VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
1077          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
1078 
1079          /* In order to chain, we need this command buffer to contain an
1080           * MI_BATCH_BUFFER_START which will jump back to the calling batch.
1081           * It doesn't matter where it points now so long as it has a valid
1082           * relocation.  We'll adjust it later as part of the chaining
1083           * process.
1084           *
1085           * We set the end of the batch a little short so we would be sure we
1086           * have room for the chaining command.  Since we're about to emit the
1087           * chaining command, let's set it back where it should go.
1088           */
1089          cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
1090          assert(cmd_buffer->batch.start == batch_bo->bo->map);
1091          assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
1092 
1093          emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
1094          assert(cmd_buffer->batch.start == batch_bo->bo->map);
1095       } else {
1096          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
1097       }
1098    }
1099 
1100    anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
1101 
1102    /* Add the current amount of data written in the current_bbo to the command
1103     * buffer.
1104     */
1105    cmd_buffer->total_batch_size += batch_bo->length;
1106 }
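
/* Editorial summary of the execution modes selected above for secondaries:
 *
 *    CALL_AND_RETURN  - the device supports calling secondaries via
 *                       MI_BATCH_BUFFER_START; the recorded return jump is
 *                       resolved when the secondary is executed.
 *    EMIT             - a single, small batch BO; cheapest to memcpy its
 *                       commands straight into the primary.
 *    CHAIN            - no SIMULTANEOUS_USE; the secondary's trailing
 *                       MI_BATCH_BUFFER_START is re-pointed at the primary
 *                       at vkCmdExecuteCommands time.
 *    COPY_AND_CHAIN   - SIMULTANEOUS_USE secondaries are cloned per
 *                       execution so each copy can be re-linked safely.
 */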
1107 
1108 static VkResult
1109 anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
1110                              struct list_head *list)
1111 {
1112    list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
1113       struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);
1114       if (bbo_ptr == NULL)
1115          return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
1116 
1117       *bbo_ptr = bbo;
1118    }
1119 
1120    return VK_SUCCESS;
1121 }
1122 
1123 void
1124 anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
1125                              struct anv_cmd_buffer *secondary)
1126 {
1127    anv_measure_add_secondary(primary, secondary);
1128    switch (secondary->exec_mode) {
1129    case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
1130       anv_batch_emit_batch(&primary->batch, &secondary->batch);
1131       break;
1132    case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
1133       struct anv_batch_bo *first_bbo =
1134          list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1135       struct anv_batch_bo *last_bbo =
1136          list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1137 
1138       emit_batch_buffer_start(&primary->batch, first_bbo->bo, 0);
1139 
1140       struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
1141       assert(primary->batch.start == this_bbo->bo->map);
1142       uint32_t offset = primary->batch.next - primary->batch.start;
1143 
1144       /* Make the tail of the secondary point back to right after the
1145        * MI_BATCH_BUFFER_START in the primary batch.
1146        */
1147       anv_batch_bo_link(primary, last_bbo, this_bbo, offset);
1148 
1149       anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
1150       break;
1151    }
1152    case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
1153       struct list_head copy_list;
1154       VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,
1155                                                 secondary,
1156                                                 &copy_list);
1157       if (result != VK_SUCCESS)
1158          return; /* FIXME */
1159 
1160       anv_cmd_buffer_add_seen_bbos(primary, &copy_list);
1161 
1162       struct anv_batch_bo *first_bbo =
1163          list_first_entry(&copy_list, struct anv_batch_bo, link);
1164       struct anv_batch_bo *last_bbo =
1165          list_last_entry(&copy_list, struct anv_batch_bo, link);
1166 
1167       cmd_buffer_chain_to_batch_bo(primary, first_bbo,
1168                                    ANV_CMD_BUFFER_BATCH_MAIN);
1169 
1170       list_splicetail(&copy_list, &primary->batch_bos);
1171 
1172       anv_batch_bo_continue(last_bbo, &primary->batch,
1173                             GFX9_MI_BATCH_BUFFER_START_length * 4);
1174       break;
1175    }
1176    case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: {
1177       struct anv_batch_bo *first_bbo =
1178          list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
1179 
1180       anv_genX(primary->device->info, batch_emit_secondary_call)(
1181          &primary->batch,
1182          (struct anv_address) { .bo = first_bbo->bo },
1183          secondary->return_addr);
1184 
1185       anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
1186       break;
1187    }
1188    default:
1189       assert(!"Invalid execution mode");
1190    }
1191 
1192    anv_reloc_list_append(&primary->surface_relocs, &secondary->surface_relocs);
1193 
1194    /* Add the amount of data written in the secondary buffer to the primary
1195     * command buffer.
1196     */
1197    primary->total_batch_size += secondary->total_batch_size;
1198 }
1199 
1200 void
1201 anv_cmd_buffer_chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
1202                                      uint32_t num_cmd_buffers)
1203 {
1204    if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
1205       assert(num_cmd_buffers == 1);
1206       return;
1207    }
1208 
1209    /* Chain the N-1 first batch buffers */
1210    for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) {
1211       assert(cmd_buffers[i]->companion_rcs_cmd_buffer == NULL);
1212       anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
1213    }
1214 
1215    /* Put an end to the last one */
1216    anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
1217 }
1218 
1219 static void
1220 anv_print_batch(struct anv_device *device,
1221                 struct anv_queue *queue,
1222                 struct anv_cmd_buffer *cmd_buffer)
1223 {
1224    struct anv_batch_bo *bbo =
1225       list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
1226    device->cmd_buffer_being_decoded = cmd_buffer;
1227    struct intel_batch_decode_ctx *ctx = queue->decoder;
1228 
1229    if (cmd_buffer->is_companion_rcs_cmd_buffer) {
1230       int render_queue_idx =
1231          anv_get_first_render_queue_index(device->physical);
1232       ctx = &device->decoder[render_queue_idx];
1233    }
1234 
1235    if (INTEL_DEBUG(DEBUG_BATCH)) {
1236       intel_print_batch(ctx, bbo->bo->map,
1237                         bbo->bo->size, bbo->bo->offset, false);
1238    }
1239    if (INTEL_DEBUG(DEBUG_BATCH_STATS)) {
1240       intel_batch_stats(ctx, bbo->bo->map,
1241                         bbo->bo->size, bbo->bo->offset, false);
1242    }
1243    device->cmd_buffer_being_decoded = NULL;
1244 }
1245 
1246 void
1247 anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
1248                                 uint32_t cmd_buffer_count,
1249                                 struct anv_cmd_buffer **cmd_buffers,
1250                                 struct anv_query_pool *perf_query_pool,
1251                                 uint32_t perf_query_pass)
1252 {
1253    if (!INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS))
1254       return;
1255 
1256    struct anv_device *device = queue->device;
1257    const bool has_perf_query = perf_query_pool && perf_query_pass >= 0 &&
1258                                cmd_buffer_count;
1259    uint64_t frame_id = device->debug_frame_desc->frame_id;
1260 
1261    if (!intel_debug_batch_in_range(device->debug_frame_desc->frame_id))
1262       return;
1263    fprintf(stderr, "Batch for frame %"PRIu64" on queue %d\n",
1264       frame_id, (int)(queue - device->queues));
1265 
1266    if (cmd_buffer_count) {
1267       if (has_perf_query) {
1268          struct anv_bo *pass_batch_bo = perf_query_pool->bo;
1269          uint64_t pass_batch_offset =
1270             khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass);
1271 
1272          if (INTEL_DEBUG(DEBUG_BATCH)) {
1273             intel_print_batch(queue->decoder,
1274                               pass_batch_bo->map + pass_batch_offset, 64,
1275                               pass_batch_bo->offset + pass_batch_offset, false);
1276          }
1277       }
1278 
1279       for (uint32_t i = 0; i < cmd_buffer_count; i++)
1280          anv_print_batch(device, queue, cmd_buffers[i]);
1281    } else if (INTEL_DEBUG(DEBUG_BATCH)) {
1282       intel_print_batch(queue->decoder, device->trivial_batch_bo->map,
1283                         device->trivial_batch_bo->size,
1284                         device->trivial_batch_bo->offset, false);
1285    }
1286 }
1287 
1288 /* We lock around execbuf for two main reasons:
1289  *
1290  *  1) When a block pool is resized, we create a new gem handle with a
1291  *     different size and, in the case of surface states, possibly a different
1292  *     center offset but we re-use the same anv_bo struct when we do so. If
1293  *     this happens in the middle of setting up an execbuf, we could end up
1294  *     with our list of BOs out of sync with our list of gem handles.
1295  *
1296  *  2) The algorithm we use for building the list of unique buffers isn't
1297  *     thread-safe. While the client is supposed to synchronize around
1298  *     QueueSubmit, this would be extremely difficult to debug if it ever came
1299  *     up in the wild due to a broken app. It's better to play it safe and
1300  *     just lock around QueueSubmit.
1301  *
1302  * Since the only other things that ever take the device lock, such as block
1303  * pool resizes, happen only rarely, this will almost never be contended, so
1304  * taking a lock isn't really an expensive operation in this case.
1305  */
1306 static inline VkResult
1307 anv_queue_exec_locked(struct anv_queue *queue,
1308                       uint32_t wait_count,
1309                       const struct vk_sync_wait *waits,
1310                       uint32_t cmd_buffer_count,
1311                       struct anv_cmd_buffer **cmd_buffers,
1312                       uint32_t signal_count,
1313                       const struct vk_sync_signal *signals,
1314                       struct anv_query_pool *perf_query_pool,
1315                       uint32_t perf_query_pass,
1316                       struct anv_utrace_submit *utrace_submit)
1317 {
1318    struct anv_device *device = queue->device;
1319    VkResult result = VK_SUCCESS;
1320 
1321    /* We only need to synchronize the main & companion command buffers if we
1322     * have a companion command buffer somewhere in the list of command
1323     * buffers.
1324     */
1325    bool needs_companion_sync = false;
1326    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
1327       if (cmd_buffers[i]->companion_rcs_cmd_buffer != NULL) {
1328          needs_companion_sync = true;
1329          break;
1330       }
1331    }
1332 
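   /* If a companion batch is present, hold back the signal operations on this
    * first submission; they are only emitted by the second submission below,
    * once queue->companion_sync has been waited upon.
    */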
1333    result =
1334       device->kmd_backend->queue_exec_locked(
1335          queue,
1336          wait_count, waits,
1337          cmd_buffer_count, cmd_buffers,
1338          needs_companion_sync ? 0 : signal_count, signals,
1339          perf_query_pool,
1340          perf_query_pass,
1341          utrace_submit);
1342    if (result != VK_SUCCESS)
1343       return result;
1344 
1345    if (needs_companion_sync) {
1346       struct vk_sync_wait companion_sync = {
1347          .sync = queue->companion_sync,
1348       };
1349       /* If any of the command buffers had a companion batch, the submission
1350        * backend will signal queue->companion_sync, so to ensure completion,
1351        * we just need to wait on that fence.
1352        */
1353       result =
1354          device->kmd_backend->queue_exec_locked(queue,
1355                                                 1, &companion_sync,
1356                                                 0, NULL,
1357                                                 signal_count, signals,
1358                                                 NULL, 0,
1359                                                 NULL);
1360    }
1361 
1362    return result;
1363 }
1364 
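/* Two command buffers can only be chained into the same submission if they
 * agree on the perf query pool: either may be NULL, otherwise both must point
 * to the same pool.
 */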
1365 static inline bool
1366 can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2)
1367 {
1368    return (!p1 || !p2 || p1 == p2);
1369 }
1370 
1371 static VkResult
1372 anv_queue_submit_sparse_bind_locked(struct anv_queue *queue,
1373                                     struct vk_queue_submit *submit)
1374 {
1375    struct anv_device *device = queue->device;
1376    VkResult result;
1377 
1378    /* When fake sparse is enabled, we accept creating "sparse" resources but
1379     * we can't actually handle sparse submission. Fake sparse is meant for
1380     * applications that request the sparse features to be enabled but never
1381     * actually *use* them.
1382     */
1383    if (!device->physical->has_sparse) {
1384       if (INTEL_DEBUG(DEBUG_SPARSE))
1385          fprintf(stderr, "=== application submitting sparse operations: "
1386                "buffer_bind:%u image_opaque_bind:%u image_bind:%u\n",
1387                submit->buffer_bind_count, submit->image_opaque_bind_count,
1388                submit->image_bind_count);
1389       return vk_queue_set_lost(&queue->vk, "Sparse binding not supported");
1390    }
1391 
1392    device->using_sparse = true;
1393 
1394    assert(submit->command_buffer_count == 0);
1395 
1396    if (INTEL_DEBUG(DEBUG_SPARSE)) {
1397       fprintf(stderr, "[sparse submission, buffers:%u opaque_images:%u "
1398               "images:%u waits:%u signals:%u]\n",
1399               submit->buffer_bind_count,
1400               submit->image_opaque_bind_count,
1401               submit->image_bind_count,
1402               submit->wait_count, submit->signal_count);
1403    }
1404 
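   /* Flatten every buffer, opaque-image and image bind of this submission
    * into a single anv_sparse_submission: the helpers below accumulate into
    * sparse_submit.binds and a single anv_sparse_bind() call at the end
    * issues the whole lot.
    */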
1405    struct anv_sparse_submission sparse_submit = {
1406       .queue = queue,
1407       .binds = NULL,
1408       .binds_len = 0,
1409       .binds_capacity = 0,
1410       .wait_count = submit->wait_count,
1411       .signal_count = submit->signal_count,
1412       .waits = submit->waits,
1413       .signals = submit->signals,
1414    };
1415 
1416    for (uint32_t i = 0; i < submit->buffer_bind_count; i++) {
1417       VkSparseBufferMemoryBindInfo *bind_info = &submit->buffer_binds[i];
1418       ANV_FROM_HANDLE(anv_buffer, buffer, bind_info->buffer);
1419 
1420       assert(anv_buffer_is_sparse(buffer));
1421 
1422       for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1423          result = anv_sparse_bind_buffer(device, buffer,
1424                                          &bind_info->pBinds[j],
1425                                          &sparse_submit);
1426          if (result != VK_SUCCESS)
1427             goto out_free_submit;
1428       }
1429    }
1430 
1431    for (uint32_t i = 0; i < submit->image_bind_count; i++) {
1432       VkSparseImageMemoryBindInfo *bind_info = &submit->image_binds[i];
1433       ANV_FROM_HANDLE(anv_image, image, bind_info->image);
1434 
1435       assert(anv_image_is_sparse(image));
1436       assert(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT);
1437 
1438       for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1439          result = anv_sparse_bind_image_memory(queue, image,
1440                                                &bind_info->pBinds[j],
1441                                                &sparse_submit);
1442          if (result != VK_SUCCESS)
1443             goto out_free_submit;
1444       }
1445    }
1446 
1447    for (uint32_t i = 0; i < submit->image_opaque_bind_count; i++) {
1448       VkSparseImageOpaqueMemoryBindInfo *bind_info =
1449          &submit->image_opaque_binds[i];
1450       ANV_FROM_HANDLE(anv_image, image, bind_info->image);
1451 
1452       assert(anv_image_is_sparse(image));
1453 
1454       for (uint32_t j = 0; j < bind_info->bindCount; j++) {
1455          result = anv_sparse_bind_image_opaque(device, image,
1456                                                &bind_info->pBinds[j],
1457                                                &sparse_submit);
1458          if (result != VK_SUCCESS)
1459             goto out_free_submit;
1460       }
1461    }
1462 
1463    result = anv_sparse_bind(device, &sparse_submit);
1464 
1465 out_free_submit:
1466    vk_free(&device->vk.alloc, sparse_submit.binds);
1467    return result;
1468 }
1469 
1470 static VkResult
1471 anv_queue_submit_cmd_buffers_locked(struct anv_queue *queue,
1472                                     struct vk_queue_submit *submit,
1473                                     struct anv_utrace_submit *utrace_submit)
1474 {
1475    VkResult result;
1476 
1477    if (submit->command_buffer_count == 0) {
1478       result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits,
1479                                      0 /* cmd_buffer_count */,
1480                                      NULL /* cmd_buffers */,
1481                                      submit->signal_count, submit->signals,
1482                                      NULL /* perf_query_pool */,
1483                                      0 /* perf_query_pass */,
1484                                      utrace_submit);
1485       if (result != VK_SUCCESS)
1486          return result;
1487    } else {
1488       /* Everything's easier if we don't have to bother with container_of() */
1489       STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0);
1490       struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers;
1491       struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers;
1492       uint32_t start = 0;
1493       uint32_t end = submit->command_buffer_count;
1494       struct anv_query_pool *perf_query_pool =
1495          cmd_buffers[start]->perf_query_pool;
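      /* Split the command buffers into maximal runs of chainable buffers that
       * agree on the perf query pool, and submit each run with one execbuf.
       * Waits are attached only to the first submission and signals (plus the
       * utrace data) only to the last one.  For example, with buffers A B C D
       * where C is not chainable, this submits [A, B], then [C], then [D].
       */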
1496       for (uint32_t n = 0; n < end; n++) {
1497          bool can_chain = false;
1498          uint32_t next = n + 1;
1499          /* Can we chain the current buffer into the next one? */
1500          if (next < end &&
1501              anv_cmd_buffer_is_chainable(cmd_buffers[n]) &&
1502              anv_cmd_buffer_is_chainable(cmd_buffers[next]) &&
1503              can_chain_query_pools
1504              (cmd_buffers[next]->perf_query_pool, perf_query_pool)) {
1505             can_chain = true;
1506             perf_query_pool =
1507                perf_query_pool ? perf_query_pool :
1508                cmd_buffers[next]->perf_query_pool;
1509          }
1510          if (!can_chain) {
1511             /* Either the next buffer cannot be chained or we have reached
1512              * the last buffer; submit what has been chained so far.
1513              */
1514             VkResult result =
1515                anv_queue_exec_locked(queue,
1516                                      start == 0 ? submit->wait_count : 0,
1517                                      start == 0 ? submit->waits : NULL,
1518                                      next - start, &cmd_buffers[start],
1519                                      next == end ? submit->signal_count : 0,
1520                                      next == end ? submit->signals : NULL,
1521                                      perf_query_pool,
1522                                      submit->perf_pass_index,
1523                                      next == end ? utrace_submit : NULL);
1524             if (result != VK_SUCCESS)
1525                return result;
1526             if (next < end) {
1527                start = next;
1528                perf_query_pool = cmd_buffers[start]->perf_query_pool;
1529             }
1530          }
1531       }
1532    }
1533    for (uint32_t i = 0; i < submit->signal_count; i++) {
1534       if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync))
1535          continue;
1536 
1537       struct anv_bo_sync *bo_sync =
1538          container_of(submit->signals[i].sync, struct anv_bo_sync, sync);
1539 
1540       /* Once the execbuf has returned, we need to set the fence state to
1541        * SUBMITTED.  We can't do this before calling execbuf because
1542        * anv_GetFenceStatus does take the global device lock before checking
1543        * fence->state.
1544        *
1545        * We set the fence state to SUBMITTED regardless of whether or not the
1546        * execbuf succeeds because we need to ensure that vkWaitForFences() and
1547        * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
1548        * VK_SUCCESS) in a finite amount of time even if execbuf fails.
1549        */
1550       assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET);
1551       bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED;
1552    }
1553 
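   /* Wake up anyone sleeping on device->queue_submit (e.g. waits on BO-backed
    * syncs) so they re-check the states we just moved to SUBMITTED.
    */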
1554    pthread_cond_broadcast(&queue->device->queue_submit);
1555 
1556    return VK_SUCCESS;
1557 }
1558 
1559 VkResult
1560 anv_queue_submit(struct vk_queue *vk_queue,
1561                  struct vk_queue_submit *submit)
1562 {
1563    struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk);
1564    struct anv_device *device = queue->device;
1565    VkResult result;
1566 
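   /* In no_hw mode nothing ever reaches the kernel, but the signal operations
    * still have to fire so that any waiters make forward progress.
    */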
1567    if (queue->device->info->no_hw) {
1568       for (uint32_t i = 0; i < submit->signal_count; i++) {
1569          result = vk_sync_signal(&device->vk,
1570                                  submit->signals[i].sync,
1571                                  submit->signals[i].signal_value);
1572          if (result != VK_SUCCESS)
1573             return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed");
1574       }
1575       return VK_SUCCESS;
1576    }
1577 
1578    /* Flush the trace points before taking the lock, as the flushing
1579     * might try to take that same lock.
1580     */
1581    struct anv_utrace_submit *utrace_submit = NULL;
1582    result = anv_device_utrace_flush_cmd_buffers(
1583       queue,
1584       submit->command_buffer_count,
1585       (struct anv_cmd_buffer **)submit->command_buffers,
1586       &utrace_submit);
1587    if (result != VK_SUCCESS)
1588       return result;
1589 
1590    pthread_mutex_lock(&device->mutex);
1591 
1592    uint64_t start_ts = intel_ds_begin_submit(&queue->ds);
1593 
1594    if (submit->buffer_bind_count ||
1595        submit->image_opaque_bind_count ||
1596        submit->image_bind_count) {
1597       result = anv_queue_submit_sparse_bind_locked(queue, submit);
1598    } else {
1599       result = anv_queue_submit_cmd_buffers_locked(queue, submit,
1600                                                    utrace_submit);
1601    }
1602 
1603    /* Take submission ID under lock */
1604    intel_ds_end_submit(&queue->ds, start_ts);
1605 
1606    pthread_mutex_unlock(&device->mutex);
1607 
1608    intel_ds_device_process(&device->ds, true);
1609 
1610    return result;
1611 }
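
/* Hedged sketch, not from this file: anv_queue_submit() is not called
 * directly by the driver; it is installed as the common runtime's submit
 * hook when the queue is initialized, roughly as below (field name assumed
 * from the vk_queue runtime):
 *
 *    queue->vk.driver_submit = anv_queue_submit;
 */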
1612 
1613 VkResult
1614 anv_queue_submit_simple_batch(struct anv_queue *queue,
1615                               struct anv_batch *batch,
1616                               bool is_companion_rcs_batch)
1617 {
1618    struct anv_device *device = queue->device;
1619    VkResult result = VK_SUCCESS;
1620 
1621    if (anv_batch_has_error(batch))
1622       return batch->status;
1623 
1624    if (queue->device->info->no_hw)
1625       return VK_SUCCESS;
1626 
1627    /* This is only used by device init so we can assume the queue is empty and
1628     * we aren't fighting with a submit thread.
1629     */
1630    assert(vk_queue_is_empty(&queue->vk));
1631 
1632    uint32_t batch_size = align(batch->next - batch->start, 8);
1633 
1634    struct anv_bo *batch_bo = NULL;
1635    result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo);
1636    if (result != VK_SUCCESS)
1637       return result;
1638 
1639    memcpy(batch_bo->map, batch->start, batch_size);
1640 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
1641    if (device->physical->memory.need_flush &&
1642        anv_bo_needs_host_cache_flush(batch_bo->alloc_flags))
1643       intel_flush_range(batch_bo->map, batch_size);
1644 #endif
1645 
1646    if (INTEL_DEBUG(DEBUG_BATCH) &&
1647        intel_debug_batch_in_range(device->debug_frame_desc->frame_id)) {
1648       int render_queue_idx =
1649          anv_get_first_render_queue_index(device->physical);
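      /* Companion batches execute on the render engine, so decode them with
       * the first render queue's decoder rather than this queue's decoder.
       */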
1650       struct intel_batch_decode_ctx *ctx = is_companion_rcs_batch ?
1651                                            &device->decoder[render_queue_idx] :
1652                                            queue->decoder;
1653       intel_print_batch(ctx, batch_bo->map, batch_bo->size, batch_bo->offset,
1654                         false);
1655    }
1656 
1657    result = device->kmd_backend->execute_simple_batch(queue, batch_bo,
1658                                                       batch_size,
1659                                                       is_companion_rcs_batch);
1660 
1661    anv_bo_pool_free(&device->batch_bo_pool, batch_bo);
1662 
1663    return result;
1664 }
1665 
1666 VkResult
1667 anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
1668                             struct anv_batch *batch)
1669 {
1670    struct anv_queue *queue = submit->queue;
1671    struct anv_device *device = queue->device;
1672    VkResult result = VK_SUCCESS;
1673 
1674    uint32_t batch_size = align(batch->next - batch->start, 8);
1675    struct anv_trtt_batch_bo *trtt_bbo;
1676    result = anv_trtt_batch_bo_new(device, batch_size, &trtt_bbo);
1677    if (result != VK_SUCCESS)
1678       return result;
1679 
1680    memcpy(trtt_bbo->bo->map, batch->start, trtt_bbo->size);
1681 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
1682    if (device->physical->memory.need_flush &&
1683        anv_bo_needs_host_cache_flush(trtt_bbo->bo->alloc_flags))
1684       intel_flush_range(trtt_bbo->bo->map, trtt_bbo->size);
1685 #endif
1686 
1687    if (INTEL_DEBUG(DEBUG_BATCH)) {
1688       intel_print_batch(queue->decoder, trtt_bbo->bo->map, trtt_bbo->bo->size,
1689                         trtt_bbo->bo->offset, false);
1690    }
1691 
1692    result = device->kmd_backend->execute_trtt_batch(submit, trtt_bbo);
1693 
1694    return result;
1695 }
1696 
1697 void
1698 anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
1699                        uint32_t num_cmd_buffers)
1700 {
1701 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
1702    struct anv_batch_bo **bbo;
1703 
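   /* intel_flush_range_no_fence() only issues the cache-line flushes; the
    * mfence before it makes prior CPU writes to the batches globally visible
    * before they are flushed, and the mfence after it makes sure the flushes
    * have completed before the batches are submitted.
    */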
1704    __builtin_ia32_mfence();
1705 
1706    for (uint32_t i = 0; i < num_cmd_buffers; i++) {
1707       u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
1708          intel_flush_range_no_fence((*bbo)->bo->map, (*bbo)->length);
1709       }
1710    }
1711 
1712    __builtin_ia32_mfence();
1713 #endif
1714 }
1715