• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2021 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "anv_private.h"
25 #include "anv_internal_kernels.h"
26 
27 #include "ds/intel_tracepoints.h"
28 #include "genxml/gen9_pack.h"
29 #include "perf/intel_perf.h"
30 #include "util/perf/cpu_trace.h"
31 
32 #include "vulkan/runtime/vk_common_entrypoints.h"
33 
/** Timestamp structure format */
union anv_utrace_timestamp {
   /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
    * PIPE_CONTROL.
    */
   uint64_t timestamp;

   /* Timestamp written by COMPUTE_WALKER::PostSync
    *
    * Layout is described in PRMs.
    * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
    *
    *    "The timestamp layout :
    *        [0] = 32b Context Timestamp Start
    *        [1] = 32b Global Timestamp Start
    *        [2] = 32b Context Timestamp End
    *        [3] = 32b Global Timestamp End"
    */
   uint32_t compute_walker[4];
};
54 
55 static uint32_t
command_buffers_count_utraces(struct anv_device * device,uint32_t cmd_buffer_count,struct anv_cmd_buffer ** cmd_buffers,uint32_t * utrace_copies)56 command_buffers_count_utraces(struct anv_device *device,
57                               uint32_t cmd_buffer_count,
58                               struct anv_cmd_buffer **cmd_buffers,
59                               uint32_t *utrace_copies)
60 {
61    if (!u_trace_should_process(&device->ds.trace_context))
62       return 0;
63 
64    uint32_t utraces = 0;
65    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
66       if (u_trace_has_points(&cmd_buffers[i]->trace)) {
67          utraces++;
68          if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
69             *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks);
70       }
71    }
72 
73    return utraces;
74 }
75 
/* u_trace callback: destroy an anv_utrace_submit once all of its timestamps
 * have been processed.  Releases every resource acquired by
 * anv_device_utrace_flush_cmd_buffers() / anv_queue_trace().
 */
static void
anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_utrace_submit *submit = submit_data;

   intel_ds_flush_data_fini(&submit->ds);

   anv_state_stream_finish(&submit->dynamic_state_stream);
   anv_state_stream_finish(&submit->general_state_stream);

   /* trace_bo is only allocated when timestamp copies were needed */
   if (submit->trace_bo)
      anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);

   /* Batch BOs chained together by anv_utrace_submit_extend_batch() */
   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
      anv_bo_pool_free(&device->utrace_bo_pool, *bo);
   util_dynarray_fini(&submit->batch_bos);

   vk_sync_destroy(&device->vk, submit->sync);

   vk_free(&device->vk.alloc, submit);
}
99 
100 static void
anv_device_utrace_emit_gfx_copy_ts_buffer(struct u_trace_context * utctx,void * cmdstream,void * ts_from,uint32_t from_offset,void * ts_to,uint32_t to_offset,uint32_t count)101 anv_device_utrace_emit_gfx_copy_ts_buffer(struct u_trace_context *utctx,
102                                           void *cmdstream,
103                                           void *ts_from, uint32_t from_offset,
104                                           void *ts_to, uint32_t to_offset,
105                                           uint32_t count)
106 {
107    struct anv_device *device =
108       container_of(utctx, struct anv_device, ds.trace_context);
109    struct anv_utrace_submit *submit = cmdstream;
110    struct anv_address from_addr = (struct anv_address) {
111       .bo = ts_from, .offset = from_offset * sizeof(union anv_utrace_timestamp) };
112    struct anv_address to_addr = (struct anv_address) {
113       .bo = ts_to, .offset = to_offset * sizeof(union anv_utrace_timestamp) };
114 
115    anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
116                                           to_addr, from_addr,
117                                           count * sizeof(union anv_utrace_timestamp));
118 }
119 
120 static void
anv_device_utrace_emit_cs_copy_ts_buffer(struct u_trace_context * utctx,void * cmdstream,void * ts_from,uint32_t from_offset,void * ts_to,uint32_t to_offset,uint32_t count)121 anv_device_utrace_emit_cs_copy_ts_buffer(struct u_trace_context *utctx,
122                                          void *cmdstream,
123                                          void *ts_from, uint32_t from_offset,
124                                          void *ts_to, uint32_t to_offset,
125                                          uint32_t count)
126 {
127    struct anv_device *device =
128       container_of(utctx, struct anv_device, ds.trace_context);
129    struct anv_utrace_submit *submit = cmdstream;
130    struct anv_address from_addr = (struct anv_address) {
131       .bo = ts_from, .offset = from_offset * sizeof(union anv_utrace_timestamp) };
132    struct anv_address to_addr = (struct anv_address) {
133       .bo = ts_to, .offset = to_offset * sizeof(union anv_utrace_timestamp) };
134 
135    struct anv_state push_data_state =
136       anv_genX(device->info, simple_shader_alloc_push)(
137          &submit->simple_state, sizeof(struct anv_memcpy_params));
138    struct anv_memcpy_params *params = push_data_state.map;
139 
140    *params = (struct anv_memcpy_params) {
141       .num_dwords = count * sizeof(union anv_utrace_timestamp) / 4,
142       .src_addr   = anv_address_physical(from_addr),
143       .dst_addr   = anv_address_physical(to_addr),
144    };
145 
146    anv_genX(device->info, emit_simple_shader_dispatch)(
147       &submit->simple_state, DIV_ROUND_UP(params->num_dwords, 4),
148       push_data_state);
149 }
150 
/* anv_batch extend callback: invoked when the submit's batch runs out of
 * space.  Allocates a new BO from the utrace pool and chains the current
 * batch into it with an MI_BATCH_BUFFER_START.
 *
 * Returns VK_SUCCESS or the BO pool allocation error.
 */
static VkResult
anv_utrace_submit_extend_batch(struct anv_batch *batch, uint32_t size,
                               void *user_data)
{
   struct anv_utrace_submit *submit = user_data;

   /* Grow geometrically: the new BO is twice the total allocated so far,
    * with an 8KB floor.
    */
   uint32_t alloc_size = 0;
   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
      alloc_size += (*bo)->size;
   alloc_size = MAX2(alloc_size * 2, 8192);

   struct anv_bo *bo;
   VkResult result = anv_bo_pool_alloc(&submit->queue->device->utrace_bo_pool,
                                       align(alloc_size, 4096),
                                       &bo);
   if (result != VK_SUCCESS)
      return result;

   util_dynarray_append(&submit->batch_bos, struct anv_bo *, bo);

   /* Reclaim the space that was reserved at the end of the previous BO for
    * exactly this chaining instruction (see anv_batch_set_storage below).
    */
   batch->end += 4 * GFX9_MI_BATCH_BUFFER_START_length;

   anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
      bbs.DWordLength               = GFX9_MI_BATCH_BUFFER_START_length -
                                      GFX9_MI_BATCH_BUFFER_START_length_bias;
      bbs.SecondLevelBatchBuffer    = Firstlevelbatch;
      bbs.AddressSpaceIndicator     = ASI_PPGTT;
      bbs.BatchBufferStartAddress   = (struct anv_address) { bo, 0 };
   }

   /* Point the batch at the new BO, reserving room at the end for the next
    * MI_BATCH_BUFFER_START chain.
    */
   anv_batch_set_storage(batch,
                         (struct anv_address) { .bo = bo, },
                         bo->map,
                         bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length);

   return VK_SUCCESS;
}
188 
189 VkResult
anv_device_utrace_flush_cmd_buffers(struct anv_queue * queue,uint32_t cmd_buffer_count,struct anv_cmd_buffer ** cmd_buffers,struct anv_utrace_submit ** out_submit)190 anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
191                                     uint32_t cmd_buffer_count,
192                                     struct anv_cmd_buffer **cmd_buffers,
193                                     struct anv_utrace_submit **out_submit)
194 {
195    struct anv_device *device = queue->device;
196    uint32_t utrace_copies = 0;
197    uint32_t utraces = command_buffers_count_utraces(device,
198                                                     cmd_buffer_count,
199                                                     cmd_buffers,
200                                                     &utrace_copies);
201    if (!utraces) {
202       *out_submit = NULL;
203       return VK_SUCCESS;
204    }
205 
206    VkResult result;
207    struct anv_utrace_submit *submit =
208       vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
209                 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
210    if (!submit)
211       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
212 
213    submit->queue = queue;
214 
215    intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);
216 
217    result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
218                            0, 0, &submit->sync);
219    if (result != VK_SUCCESS)
220       goto error_sync;
221 
222    util_dynarray_init(&submit->batch_bos, NULL);
223 
224    if (utrace_copies > 0) {
225       result = anv_bo_pool_alloc(&device->utrace_bo_pool,
226                                  utrace_copies * 4096,
227                                  &submit->trace_bo);
228       if (result != VK_SUCCESS)
229          goto error_trace_buf;
230 
231       const bool uses_relocs = device->physical->uses_relocs;
232       result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
233       if (result != VK_SUCCESS)
234          goto error_reloc_list;
235 
236       anv_state_stream_init(&submit->dynamic_state_stream,
237                             &device->dynamic_state_pool, 16384);
238       anv_state_stream_init(&submit->general_state_stream,
239                             &device->general_state_pool, 16384);
240 
241       submit->batch = (struct anv_batch) {
242          .alloc = &device->vk.alloc,
243          .relocs = &submit->relocs,
244          .user_data = submit,
245          .extend_cb = anv_utrace_submit_extend_batch,
246       };
247 
248       /* Only engine class where we support timestamp copies
249        *
250        * TODO: add INTEL_ENGINE_CLASS_COPY support (should be trivial ;)
251        */
252       assert(queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER ||
253              queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE);
254       if (queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER) {
255 
256          trace_intel_begin_trace_copy_cb(&submit->ds.trace, &submit->batch);
257 
258          anv_genX(device->info, emit_so_memcpy_init)(&submit->memcpy_state,
259                                                      device,
260                                                      &submit->batch);
261          uint32_t num_traces = 0;
262          for (uint32_t i = 0; i < cmd_buffer_count; i++) {
263             if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
264                intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
265                                          &submit->ds, false);
266             } else {
267                num_traces += cmd_buffers[i]->trace.num_traces;
268                u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
269                                     u_trace_end_iterator(&cmd_buffers[i]->trace),
270                                     &submit->ds.trace,
271                                     submit,
272                                     anv_device_utrace_emit_gfx_copy_ts_buffer);
273             }
274          }
275          anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
276 
277          trace_intel_end_trace_copy_cb(&submit->ds.trace, &submit->batch,
278                                        num_traces);
279 
280          anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
281       } else {
282          trace_intel_begin_trace_copy_cb(&submit->ds.trace, &submit->batch);
283 
284          submit->simple_state = (struct anv_simple_shader) {
285             .device               = device,
286             .dynamic_state_stream = &submit->dynamic_state_stream,
287             .general_state_stream = &submit->general_state_stream,
288             .batch                = &submit->batch,
289             .kernel               = device->internal_kernels[
290                ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE],
291             .l3_config            = device->internal_kernels_l3_config,
292          };
293          anv_genX(device->info, emit_simple_shader_init)(&submit->simple_state);
294 
295          uint32_t num_traces = 0;
296          for (uint32_t i = 0; i < cmd_buffer_count; i++) {
297             num_traces += cmd_buffers[i]->trace.num_traces;
298             if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
299                intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
300                                          &submit->ds, false);
301             } else {
302                num_traces += cmd_buffers[i]->trace.num_traces;
303                u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
304                                     u_trace_end_iterator(&cmd_buffers[i]->trace),
305                                     &submit->ds.trace,
306                                     submit,
307                                     anv_device_utrace_emit_cs_copy_ts_buffer);
308             }
309          }
310 
311          trace_intel_end_trace_copy_cb(&submit->ds.trace, &submit->batch,
312                                        num_traces);
313 
314          anv_genX(device->info, emit_simple_shader_end)(&submit->simple_state);
315       }
316 
317       intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds, true);
318 
319       if (submit->batch.status != VK_SUCCESS) {
320          result = submit->batch.status;
321          goto error_batch;
322       }
323    } else {
324       for (uint32_t i = 0; i < cmd_buffer_count; i++) {
325          assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
326          intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
327                                    &submit->ds, i == (cmd_buffer_count - 1));
328       }
329    }
330 
331    *out_submit = submit;
332 
333    return VK_SUCCESS;
334 
335  error_batch:
336    anv_reloc_list_finish(&submit->relocs);
337    util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
338       anv_bo_pool_free(&device->utrace_bo_pool, *bo);
339  error_reloc_list:
340    anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
341  error_trace_buf:
342    vk_sync_destroy(&device->vk, submit->sync);
343  error_sync:
344    intel_ds_flush_data_fini(&submit->ds);
345    vk_free(&device->vk.alloc, submit);
346    return result;
347 }
348 
349 static void *
anv_utrace_create_ts_buffer(struct u_trace_context * utctx,uint32_t size_b)350 anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
351 {
352    struct anv_device *device =
353       container_of(utctx, struct anv_device, ds.trace_context);
354 
355    uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
356       sizeof(union anv_utrace_timestamp);
357 
358    struct anv_bo *bo = NULL;
359    UNUSED VkResult result =
360       anv_bo_pool_alloc(&device->utrace_bo_pool,
361                         align(anv_ts_size_b, 4096),
362                         &bo);
363    assert(result == VK_SUCCESS);
364 
365    memset(bo->map, 0, bo->size);
366 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
367    if (device->physical->memory.need_flush &&
368        anv_bo_needs_host_cache_flush(bo->alloc_flags))
369       intel_flush_range(bo->map, bo->size);
370 #endif
371 
372    return bo;
373 }
374 
375 static void
anv_utrace_destroy_ts_buffer(struct u_trace_context * utctx,void * timestamps)376 anv_utrace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps)
377 {
378    struct anv_device *device =
379       container_of(utctx, struct anv_device, ds.trace_context);
380    struct anv_bo *bo = timestamps;
381 
382    anv_bo_pool_free(&device->utrace_bo_pool, bo);
383 }
384 
/* u_trace callback: emit GPU commands writing a timestamp into entry `idx`
 * of the timestamp BO.  For the end of a compute dispatch, instead of a
 * separate capture, the last COMPUTE_WALKER / indirect dispatch command is
 * rewritten so its PostSync produces the timestamp.
 */
static void
anv_utrace_record_ts(struct u_trace *ut, void *cs,
                     void *timestamps, unsigned idx,
                     bool end_of_pipe)
{
   struct anv_device *device =
      container_of(ut->utctx, struct anv_device, ds.trace_context);
   struct anv_cmd_buffer *cmd_buffer =
      container_of(ut, struct anv_cmd_buffer, trace);
   /* cmd_buffer is only valid if cs == NULL */
   struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
   struct anv_bo *bo = timestamps;

   struct anv_address ts_address = (struct anv_address) {
      .bo = bo,
      .offset = idx * sizeof(union anv_utrace_timestamp)
   };

   /* Is this an end of compute trace point? */
   const bool is_end_compute =
      cs == NULL &&
      (cmd_buffer->last_compute_walker != NULL ||
       cmd_buffer->last_indirect_dispatch != NULL) &&
      end_of_pipe;

   /* Rewrite the last dispatch command for end-of-compute points, otherwise
    * capture at end/top of pipe as requested.
    */
   enum anv_timestamp_capture_type capture_type = end_of_pipe ?
      (is_end_compute ?
       (cmd_buffer->last_indirect_dispatch != NULL ?
        ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH : ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER) :
       ANV_TIMESTAMP_CAPTURE_END_OF_PIPE) : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;

   /* Command to be rewritten (NULL for plain captures) */
   void *addr = capture_type ==  ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH ?
                cmd_buffer->last_indirect_dispatch :
                capture_type ==  ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER ?
                cmd_buffer->last_compute_walker : NULL;

   device->physical->cmd_emit_timestamp(batch, device, ts_address,
                                        capture_type,
                                        addr);
   /* Consume the rewritten command so it is not rewritten twice */
   if (is_end_compute) {
      cmd_buffer->last_compute_walker = NULL;
      cmd_buffer->last_indirect_dispatch = NULL;
   }
}
429 
/* u_trace callback: read back timestamp `idx` from the BO and convert it to
 * nanoseconds.  Blocks on the submit's sync for the first entry only, since
 * all entries complete together.
 */
static uint64_t
anv_utrace_read_ts(struct u_trace_context *utctx,
                   void *timestamps, unsigned idx, void *flush_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_bo *bo = timestamps;
   struct anv_utrace_submit *submit = flush_data;

   /* Only need to stall on results for the first entry: */
   if (idx == 0) {
      MESA_TRACE_SCOPE("anv utrace wait timestamps");
      UNUSED VkResult result =
         vk_sync_wait(&device->vk,
                      submit->sync,
                      0,
                      VK_SYNC_WAIT_COMPLETE,
                      os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
      assert(result == VK_SUCCESS);
   }

   union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;

   /* Don't translate the no-timestamp marker: */
   if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
      return U_TRACE_NO_TIMESTAMP;

   /* Detect a 16 bytes timestamp write */
   if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
      /* The timestamp written by COMPUTE_WALKER::PostSync only has 32 bits.
       * We need to rebuild the full 64 bits using the previous timestamp.
       * We assume that utrace is reading the timestamps in order.  Anyway
       * timestamps roll over on 32 bits in a few minutes, so in most cases
       * that should be correct.
       */
      uint64_t timestamp =
         (submit->last_full_timestamp & 0xffffffff00000000) |
         (uint64_t) ts[idx].compute_walker[3];

      return intel_device_info_timebase_scale(device->info, timestamp);
   }

   /* Remember the last full 64-bit value to reconstruct compute timestamps */
   submit->last_full_timestamp = ts[idx].timestamp;

   return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
}
476 
/* Initialize the device-level utrace state: BO pool for timestamp/batch
 * buffers, the perfetto/ds device, the u_trace context with our buffer and
 * timestamp callbacks, and one ds queue per Vulkan queue.
 */
void
anv_device_utrace_init(struct anv_device *device)
{
   anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace",
                    ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_HOST_CACHED_COHERENT);
   intel_ds_device_init(&device->ds, device->info, device->fd,
                        device->physical->local_minor,
                        INTEL_DS_API_VULKAN);
   u_trace_context_init(&device->ds.trace_context,
                        &device->ds,
                        anv_utrace_create_ts_buffer,
                        anv_utrace_destroy_ts_buffer,
                        anv_utrace_record_ts,
                        anv_utrace_read_ts,
                        anv_utrace_delete_submit);

   for (uint32_t q = 0; q < device->queue_count; q++) {
      struct anv_queue *queue = &device->queues[q];

      /* Name each ds queue after its engine class + index, e.g. "rcs0" */
      intel_ds_device_init_queue(&device->ds, &queue->ds, "%s%u",
                                 intel_engines_class_to_string(queue->family->engine_class),
                                 queue->vk.index_in_family);
   }
}
501 
/* Tear down device-level utrace state: drain pending traces (blocking),
 * then destroy the ds device and the backing BO pool.
 */
void
anv_device_utrace_finish(struct anv_device *device)
{
   intel_ds_device_process(&device->ds, true);
   intel_ds_device_fini(&device->ds);
   anv_bo_pool_finish(&device->utrace_bo_pool);
}
509 
510 enum intel_ds_stall_flag
anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)511 anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)
512 {
513    static const struct {
514       enum anv_pipe_bits anv;
515       enum intel_ds_stall_flag ds;
516    } anv_to_ds_flags[] = {
517       { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,            .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, },
518       { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT,             .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, },
519       { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT,             .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, },
520       { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,    .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, },
521       { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,       .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, },
522       { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,    .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, },
523       { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT,          .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, },
524       { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,     .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, },
525       { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, },
526       { .anv = ANV_PIPE_DEPTH_STALL_BIT,                  .ds = INTEL_DS_DEPTH_STALL_BIT, },
527       { .anv = ANV_PIPE_CS_STALL_BIT,                     .ds = INTEL_DS_CS_STALL_BIT, },
528       { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT,           .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, },
529       { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT,          .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, },
530       { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, },
531       { .anv = ANV_PIPE_PSS_STALL_SYNC_BIT,               .ds = INTEL_DS_PSS_STALL_SYNC_BIT, },
532       { .anv = ANV_PIPE_END_OF_PIPE_SYNC_BIT,             .ds = INTEL_DS_END_OF_PIPE_BIT, },
533       { .anv = ANV_PIPE_CCS_CACHE_FLUSH_BIT,              .ds = INTEL_DS_CCS_CACHE_FLUSH_BIT, },
534    };
535 
536    enum intel_ds_stall_flag ret = 0;
537    for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) {
538       if (anv_to_ds_flags[i].anv & bits)
539          ret |= anv_to_ds_flags[i].ds;
540    }
541 
542    return ret;
543 }
544 
/* Push the debug label through the common vk runtime, then mark the start
 * of the annotation in the command buffer's trace.
 */
void anv_CmdBeginDebugUtilsLabelEXT(
   VkCommandBuffer _commandBuffer,
   const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   vk_common_CmdBeginDebugUtilsLabelEXT(_commandBuffer, pLabelInfo);

   trace_intel_begin_cmd_buffer_annotation(&cmd_buffer->trace);
}
555 
/* Close the annotation with the label currently on top of the runtime's
 * label stack (before vk_common pops it), then forward to the common code.
 */
void anv_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   if (cmd_buffer->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *label =
         util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT);

      trace_intel_end_cmd_buffer_annotation(&cmd_buffer->trace,
                                            strlen(label->pLabelName),
                                            label->pLabelName);
   }

   vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer);
}
571 
/* Build and submit a tiny standalone batch carrying a single trace point.
 *
 * Used for frame markers (frame == true) and queue-level annotations
 * (frame == false); `begin` selects the begin/end variant.  Failures are
 * silent (tracing is best-effort): all partially-acquired resources are
 * released and the function simply returns.
 */
void
anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin)
{
   struct anv_device *device = queue->device;

   VkResult result;
   struct anv_utrace_submit *submit =
      vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!submit)
      return;

   submit->queue = queue;

   intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);

   result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
                           0, 0, &submit->sync);
   if (result != VK_SUCCESS)
      goto error_trace;

   const bool uses_relocs = device->physical->uses_relocs;
   result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
   if (result != VK_SUCCESS)
      goto error_sync;

   /* Batch storage is allocated lazily by the extend callback on first emit */
   submit->batch = (struct anv_batch) {
      .alloc = &device->vk.alloc,
      .relocs = &submit->relocs,
      .user_data = submit,
      .extend_cb = anv_utrace_submit_extend_batch,
   };

   if (frame) {
      if (begin)
         trace_intel_begin_frame(&submit->ds.trace, &submit->batch);
      else
         trace_intel_end_frame(&submit->ds.trace, &submit->batch,
                               device->debug_frame_desc->frame_id);
   } else {
      if (begin) {
         trace_intel_begin_queue_annotation(&submit->ds.trace, &submit->batch);
      } else {
         trace_intel_end_queue_annotation(&submit->ds.trace,
                                          &submit->batch,
                                          strlen(label),
                                          label);
      }
   }

   anv_batch_emit(&submit->batch, GFX9_MI_BATCH_BUFFER_END, bbs);
   anv_batch_emit(&submit->batch, GFX9_MI_NOOP, noop);

   if (submit->batch.status != VK_SUCCESS) {
      result = submit->batch.status;
      goto error_reloc_list;
   }

   intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds, true);

   /* Submission must be serialized against other queue work */
   pthread_mutex_lock(&device->mutex);
   device->kmd_backend->queue_exec_trace(queue, submit);
   pthread_mutex_unlock(&device->mutex);

   return;

 error_reloc_list:
   anv_reloc_list_finish(&submit->relocs);
   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
      anv_bo_pool_free(&device->utrace_bo_pool, *bo);
 error_sync:
   vk_sync_destroy(&device->vk, submit->sync);
 error_trace:
   intel_ds_flush_data_fini(&submit->ds);
   vk_free(&device->vk.alloc, submit);
}
648 
/* Record the label in the vk runtime, then emit a queue-annotation begin
 * trace point on the queue.
 */
void
anv_QueueBeginDebugUtilsLabelEXT(
   VkQueue _queue,
   const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_queue, queue, _queue);

   vk_common_QueueBeginDebugUtilsLabelEXT(_queue, pLabelInfo);

   anv_queue_trace(queue, pLabelInfo->pLabelName,
                   false /* frame */, true /* begin */);
}
661 
/* Emit a queue-annotation end trace point for the label on top of the
 * runtime's stack (before vk_common pops it), process pending traces, then
 * forward to the common code.
 */
void
anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue)
{
   VK_FROM_HANDLE(anv_queue, queue, _queue);

   if (queue->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *label =
         util_dynarray_top_ptr(&queue->vk.labels, VkDebugUtilsLabelEXT);
      anv_queue_trace(queue, label->pLabelName,
                      false /* frame */, false /* begin */);

      /* Flush collected traces to the perfetto backend (blocking) */
      intel_ds_device_process(&queue->device->ds, true);
   }

   vk_common_QueueEndDebugUtilsLabelEXT(_queue);
}
678