/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"
#include "anv_internal_kernels.h"

#include "common/intel_debug_identifier.h"
#include "ds/intel_tracepoints.h"
#include "genxml/gen90_pack.h"
#include "perf/intel_perf.h"
#include "util/perf/cpu_trace.h"

#include "vk_common_entrypoints.h"

/** Timestamp structure format */
union anv_utrace_timestamp {
   /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
    * PIPE_CONTROL.
    */
   uint64_t timestamp;

   /* Timestamp written by COMPUTE_WALKER::PostSync
    *
    * Layout is described in PRMs.
    * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
    *
    *    "The timestamp layout :
    *        [0] = 32b Context Timestamp Start
    *        [1] = 32b Global Timestamp Start
    *        [2] = 32b Context Timestamp End
    *        [3] = 32b Global Timestamp End"
    */
   uint32_t gfx125_postsync_data[4];

   /* Timestamp written by COMPUTE_WALKER::PostSync
    *
    * BSpec 56591:
    *
    *    "The timestamp layout :
    *       [0] = 64b Context Timestamp Start
    *       [1] = 64b Global Timestamp Start
    *       [2] = 64b Context Timestamp End
    *       [3] = 64b Global Timestamp End"
    */
   uint64_t gfx20_postsync_data[4];
};

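/* Count the command buffers that carry utrace points. For command buffers
 * that can be resubmitted (no ONE_TIME_SUBMIT flag), also accumulate in
 * utrace_copies the number of trace chunks whose timestamps will have to be
 * copied out before reuse.
 */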
static uint32_t
command_buffers_count_utraces(struct anv_device *device,
                              uint32_t cmd_buffer_count,
                              struct anv_cmd_buffer **cmd_buffers,
                              uint32_t *utrace_copies)
{
   if (!u_trace_should_process(&device->ds.trace_context))
      return 0;

   uint32_t utraces = 0;
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      if (u_trace_has_points(&cmd_buffers[i]->trace)) {
         utraces++;
         if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
            *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks);
      }
   }

   return utraces;
}

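/* u_trace callback invoked once the trace data of a submit has been
 * processed; releases all resources owned by the utrace submit.
 */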
static void
anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_utrace_submit *submit =
      container_of(submit_data, struct anv_utrace_submit, ds);

   intel_ds_flush_data_fini(&submit->ds);

   anv_state_stream_finish(&submit->dynamic_state_stream);
   anv_state_stream_finish(&submit->general_state_stream);

   anv_async_submit_fini(&submit->base);

   vk_free(&device->vk.alloc, submit);
}

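/* Copy timestamps on the render engine, using the streamout (SO) based
 * memcpy.
 */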
void
anv_device_utrace_emit_gfx_copy_buffer(struct u_trace_context *utctx,
                                       void *cmdstream,
                                       void *ts_from, uint64_t from_offset_B,
                                       void *ts_to, uint64_t to_offset_B,
                                       uint64_t size_B)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_memcpy_state *memcpy_state = cmdstream;
   struct anv_address from_addr = (struct anv_address) {
      .bo = ts_from, .offset = from_offset_B };
   struct anv_address to_addr = (struct anv_address) {
      .bo = ts_to, .offset = to_offset_B };

   anv_genX(device->info, emit_so_memcpy)(memcpy_state,
                                          to_addr, from_addr, size_B);
}

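/* Copy timestamps on the compute engine, using the internal memcpy kernel:
 * source/destination addresses and the dword count are passed through push
 * constants, then enough invocations are dispatched to cover the copy (4
 * dwords per invocation).
 */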
static void
anv_device_utrace_emit_cs_copy_buffer(struct u_trace_context *utctx,
                                      void *cmdstream,
                                      void *ts_from, uint64_t from_offset_B,
                                      void *ts_to, uint64_t to_offset_B,
                                      uint64_t size_B)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_simple_shader *simple_state = cmdstream;
   struct anv_address from_addr = (struct anv_address) {
      .bo = ts_from, .offset = from_offset_B };
   struct anv_address to_addr = (struct anv_address) {
      .bo = ts_to, .offset = to_offset_B };

   struct anv_state push_data_state =
      anv_genX(device->info, simple_shader_alloc_push)(
         simple_state, sizeof(struct anv_memcpy_params));
   struct anv_memcpy_params *params = push_data_state.map;

   *params = (struct anv_memcpy_params) {
      .num_dwords = size_B / 4,
      .src_addr   = anv_address_physical(from_addr),
      .dst_addr   = anv_address_physical(to_addr),
   };

   anv_genX(device->info, emit_simple_shader_dispatch)(
      simple_state, DIV_ROUND_UP(params->num_dwords, 4),
      push_data_state);
}

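/* Prepare the utrace data of the given command buffers for processing.
 *
 * Command buffers flagged ONE_TIME_SUBMIT hand their trace straight to the
 * queue, since their timestamp buffers cannot be overwritten by a later
 * submission. Re-usable command buffers instead get their timestamps copied
 * into buffers owned by the returned anv_utrace_submit, using either the SO
 * memcpy (render engines) or the compute memcpy kernel.
 */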
VkResult
anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
                                    uint32_t cmd_buffer_count,
                                    struct anv_cmd_buffer **cmd_buffers,
                                    struct anv_utrace_submit **out_submit)
{
   struct anv_device *device = queue->device;
   uint32_t utrace_copies = 0;
   uint32_t utraces = command_buffers_count_utraces(device,
                                                    cmd_buffer_count,
                                                    cmd_buffers,
                                                    &utrace_copies);
   if (!utraces) {
      *out_submit = NULL;
      return VK_SUCCESS;
   }

   VkResult result;
   struct anv_utrace_submit *submit =
      vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!submit)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_async_submit_init(&submit->base, queue,
                                  &device->batch_bo_pool,
                                  false, true);
   if (result != VK_SUCCESS)
      goto error_async;

   intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);

   struct anv_batch *batch = &submit->base.batch;
   if (utrace_copies > 0) {
      anv_state_stream_init(&submit->dynamic_state_stream,
                            &device->dynamic_state_pool, 16384);
      anv_state_stream_init(&submit->general_state_stream,
                            &device->general_state_pool, 16384);

      /* These are the only engine classes where we support timestamp copies.
       *
       * TODO: add INTEL_ENGINE_CLASS_COPY support (should be trivial ;)
       */
      assert(queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER ||
             queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE);
      if (queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER) {

         trace_intel_begin_trace_copy_cb(&submit->ds.trace, batch);

         anv_genX(device->info, emit_so_memcpy_init)(&submit->memcpy_state,
                                                     device, NULL, batch);
         uint32_t num_traces = 0;
         for (uint32_t i = 0; i < cmd_buffer_count; i++) {
            if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
               intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                         &submit->ds, device->vk.current_frame, false);
            } else {
               num_traces += cmd_buffers[i]->trace.num_traces;
               u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
                                    u_trace_end_iterator(&cmd_buffers[i]->trace),
                                    &submit->ds.trace,
                                    &submit->memcpy_state,
                                    anv_device_utrace_emit_gfx_copy_buffer);
            }
         }
         anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);

         trace_intel_end_trace_copy_cb(&submit->ds.trace, batch, num_traces);

         anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
      } else {
         struct anv_shader_bin *copy_kernel;
         VkResult ret =
            anv_device_get_internal_shader(device,
                                           ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
                                           &copy_kernel);
         if (ret != VK_SUCCESS)
            goto error_sync;

         trace_intel_begin_trace_copy_cb(&submit->ds.trace, batch);

         submit->simple_state = (struct anv_simple_shader) {
            .device               = device,
            .dynamic_state_stream = &submit->dynamic_state_stream,
            .general_state_stream = &submit->general_state_stream,
            .batch                = batch,
            .kernel               = copy_kernel,
            .l3_config            = device->internal_kernels_l3_config,
         };
         anv_genX(device->info, emit_simple_shader_init)(&submit->simple_state);

         /* Like in the render path, only count the traces whose timestamps
          * are actually copied (counting them again in the else branch below
          * would double count them).
          */
         uint32_t num_traces = 0;
         for (uint32_t i = 0; i < cmd_buffer_count; i++) {
            if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
               intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                         &submit->ds, device->vk.current_frame, false);
            } else {
               num_traces += cmd_buffers[i]->trace.num_traces;
               u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
                                    u_trace_end_iterator(&cmd_buffers[i]->trace),
                                    &submit->ds.trace,
                                    &submit->simple_state,
                                    anv_device_utrace_emit_cs_copy_buffer);
            }
         }

         trace_intel_end_trace_copy_cb(&submit->ds.trace, batch, num_traces);

         anv_genX(device->info, emit_simple_shader_end)(&submit->simple_state);
      }

      if (batch->status != VK_SUCCESS) {
         result = batch->status;
         goto error_sync;
      }

      intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds,
                                device->vk.current_frame, true);
   } else {
      for (uint32_t i = 0; i < cmd_buffer_count; i++) {
         assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
         intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                   &submit->ds, device->vk.current_frame,
                                   i == (cmd_buffer_count - 1));
      }
   }

   *out_submit = submit;

   return VK_SUCCESS;

 error_sync:
   intel_ds_flush_data_fini(&submit->ds);
   anv_async_submit_fini(&submit->base);
 error_async:
   vk_free(&device->vk.alloc, submit);
   return result;
}

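/* u_trace callback: allocate a page-aligned BO to hold timestamps and zero
 * it, so that unwritten entries read back as the no-timestamp marker.
 */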
static void *
anv_utrace_create_buffer(struct u_trace_context *utctx, uint64_t size_B)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);

   struct anv_bo *bo = NULL;
   UNUSED VkResult result =
      anv_bo_pool_alloc(&device->utrace_bo_pool,
                        align(size_B, 4096),
                        &bo);
   assert(result == VK_SUCCESS);

   memset(bo->map, 0, bo->size);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
   if (device->physical->memory.need_flush &&
       anv_bo_needs_host_cache_flush(bo->alloc_flags))
      intel_flush_range(bo->map, bo->size);
#endif

   return bo;
}

static void
anv_utrace_destroy_buffer(struct u_trace_context *utctx, void *timestamps)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_bo *bo = timestamps;

   anv_bo_pool_free(&device->utrace_bo_pool, bo);
}

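/* u_trace callback: emit the commands writing a timestamp at offset_B into
 * the timestamp BO. On Gfx12.5+, end-of-compute timestamps are captured
 * through the PostSync field of the last COMPUTE_WALKER / indirect dispatch
 * rather than through an end-of-pipe write.
 */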
static void
anv_utrace_record_ts(struct u_trace *ut, void *cs,
                     void *timestamps, uint64_t offset_B,
                     uint32_t flags)
{
   struct anv_device *device =
      container_of(ut->utctx, struct anv_device, ds.trace_context);
   struct anv_cmd_buffer *cmd_buffer =
      container_of(ut, struct anv_cmd_buffer, trace);
   /* cmd_buffer is only valid if cs == NULL */
   struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
   struct anv_bo *bo = timestamps;

   assert(offset_B % sizeof(union anv_utrace_timestamp) == 0);
   struct anv_address ts_address = (struct anv_address) {
      .bo = bo,
      .offset = offset_B,
   };

   /* Is this an end-of-compute trace point? */
   const bool is_end_compute =
      cs == NULL &&
      (flags & INTEL_DS_TRACEPOINT_FLAG_END_CS);
   const bool is_end_compute_or_noop =
      cs == NULL &&
      (flags & INTEL_DS_TRACEPOINT_FLAG_END_CS_OR_NOOP);
   enum anv_timestamp_capture_type capture_type;
   if (is_end_compute) {
      assert(device->info->verx10 < 125 ||
             cmd_buffer->state.last_indirect_dispatch != NULL ||
             cmd_buffer->state.last_compute_walker != NULL);
      capture_type =
         device->info->verx10 >= 125 ?
         (cmd_buffer->state.last_indirect_dispatch != NULL ?
          ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH :
          ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER) :
          ANV_TIMESTAMP_CAPTURE_END_OF_PIPE;
   } else if (is_end_compute_or_noop) {
      capture_type =
         device->info->verx10 >= 125 ?
         (cmd_buffer->state.last_indirect_dispatch != NULL ?
          ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH :
          (cmd_buffer->state.last_compute_walker != NULL ?
           ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER :
           ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE)) :
         ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
   } else {
      capture_type = (flags & INTEL_DS_TRACEPOINT_FLAG_END_CS) ?
         ANV_TIMESTAMP_CAPTURE_END_OF_PIPE :
         ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
   }

   void *addr = capture_type == ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH ?
                cmd_buffer->state.last_indirect_dispatch :
                capture_type == ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER ?
                cmd_buffer->state.last_compute_walker : NULL;

   device->physical->cmd_emit_timestamp(batch, device, ts_address,
                                        capture_type,
                                        addr);
   if (is_end_compute) {
      cmd_buffer->state.last_compute_walker = NULL;
      cmd_buffer->state.last_indirect_dispatch = NULL;
   }
}

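/* u_trace callback: read back a timestamp from the BO, waiting for the GPU
 * to complete the submit on the first entry, and convert it from raw
 * timestamp ticks using the device timebase.
 */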
static uint64_t
anv_utrace_read_ts(struct u_trace_context *utctx,
                   void *timestamps, uint64_t offset_B,
                   void *flush_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_bo *bo = timestamps;
   struct anv_utrace_submit *submit =
      container_of(flush_data, struct anv_utrace_submit, ds);

   /* Only need to stall on results for the first entry: */
   if (offset_B == 0) {
      MESA_TRACE_SCOPE("anv utrace wait timestamps");
      UNUSED VkResult result =
         vk_sync_wait(&device->vk,
                      submit->base.signal.sync,
                      submit->base.signal.signal_value,
                      VK_SYNC_WAIT_COMPLETE,
                      os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
      assert(result == VK_SUCCESS);
   }

   assert(offset_B % sizeof(union anv_utrace_timestamp) == 0);
   union anv_utrace_timestamp *ts =
      (union anv_utrace_timestamp *)(bo->map + offset_B);

   /* Don't translate the no-timestamp marker: */
   if (ts->timestamp == U_TRACE_NO_TIMESTAMP)
      return U_TRACE_NO_TIMESTAMP;

   /* Detect a 16/32 bytes timestamp write */
   if (ts->gfx20_postsync_data[1] != 0 ||
       ts->gfx20_postsync_data[2] != 0 ||
       ts->gfx20_postsync_data[3] != 0) {
      if (device->info->ver >= 20) {
         return intel_device_info_timebase_scale(device->info,
                                                 ts->gfx20_postsync_data[3]);
      }

      /* The timestamp written by COMPUTE_WALKER::PostSync only has 32 bits.
       * We need to rebuild the full 64 bits using the previous timestamp. We
       * assume that utrace is reading the timestamps in order. Anyway the
       * timestamp rolls over on 32 bits in a few minutes, so in most cases
       * that should be correct.
       */
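      /* For example (made-up values): with last_full_timestamp ==
       * 0x0000000312345678 and gfx125_postsync_data[3] == 0x9abcdef0, the
       * reconstructed timestamp is 0x000000039abcdef0.
       */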
      uint64_t timestamp =
         (submit->last_full_timestamp & 0xffffffff00000000) |
         (uint64_t) ts->gfx125_postsync_data[3];

      return intel_device_info_timebase_scale(device->info, timestamp);
   }

   submit->last_full_timestamp = ts->timestamp;

   return intel_device_info_timebase_scale(device->info, ts->timestamp);
}

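/* u_trace callback: emit the commands copying size_B bytes of tracepoint
 * payload data from src_buffer into the trace's dst_buffer.
 */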
static void
anv_utrace_capture_data(struct u_trace *ut,
                        void *cs,
                        void *dst_buffer,
                        uint64_t dst_offset_B,
                        void *src_buffer,
                        uint64_t src_offset_B,
                        uint32_t size_B)
{
   struct anv_device *device =
      container_of(ut->utctx, struct anv_device, ds.trace_context);
   struct anv_cmd_buffer *cmd_buffer =
      container_of(ut, struct anv_cmd_buffer, trace);
   /* cmd_buffer is only valid if cs == NULL */
   struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
   struct anv_address dst_addr = {
      .bo = dst_buffer,
      .offset = dst_offset_B,
   };
   struct anv_address src_addr = {
      .bo = src_buffer,
      .offset = src_offset_B,
   };

   device->physical->cmd_capture_data(batch, device, dst_addr, src_addr, size_B);
}

static const void *
anv_utrace_get_data(struct u_trace_context *utctx,
                    void *buffer, uint64_t offset_B, uint32_t size_B)
{
   struct anv_bo *bo = buffer;

   return bo->map + offset_B;
}

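/* Set up the device-level utrace state: the BO pool backing timestamp
 * buffers, the intel_ds device with one intel_ds queue per Vulkan queue,
 * and the u_trace context with the callbacks defined above.
 */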
void
anv_device_utrace_init(struct anv_device *device)
{
   device->utrace_timestamp_size = sizeof(union anv_utrace_timestamp);

   anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace",
                    ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_HOST_CACHED_COHERENT);
   intel_ds_device_init(&device->ds, device->info, device->fd,
                        device->physical->local_minor,
                        INTEL_DS_API_VULKAN);
   u_trace_context_init(&device->ds.trace_context,
                        &device->ds,
                        device->utrace_timestamp_size,
                        12,
                        anv_utrace_create_buffer,
                        anv_utrace_destroy_buffer,
                        anv_utrace_record_ts,
                        anv_utrace_read_ts,
                        anv_utrace_capture_data,
                        anv_utrace_get_data,
                        anv_utrace_delete_submit);

   for (uint32_t q = 0; q < device->queue_count; q++) {
      struct anv_queue *queue = &device->queues[q];

      intel_ds_device_init_queue(&device->ds, &queue->ds, "%s%u",
                                 intel_engines_class_to_string(queue->family->engine_class),
                                 queue->vk.index_in_family);
   }
}

void
anv_device_utrace_finish(struct anv_device *device)
{
   intel_ds_device_process(&device->ds, true);
   intel_ds_device_fini(&device->ds);
   anv_bo_pool_finish(&device->utrace_bo_pool);
}

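/* Translate ANV pipe-control bits into their intel_ds equivalents so that
 * stall reasons can be reported in traces.
 */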
enum intel_ds_stall_flag
anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)
{
   static const struct {
      enum anv_pipe_bits anv;
      enum intel_ds_stall_flag ds;
   } anv_to_ds_flags[] = {
      { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,            .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT,             .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT,             .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_L3_FABRIC_FLUSH_BIT,              .ds = INTEL_DS_L3_FABRIC_FLUSH_BIT, },
      { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,    .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,       .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,    .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT,          .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,     .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_DEPTH_STALL_BIT,                  .ds = INTEL_DS_DEPTH_STALL_BIT, },
      { .anv = ANV_PIPE_CS_STALL_BIT,                     .ds = INTEL_DS_CS_STALL_BIT, },
      { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT,           .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, },
      { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT,          .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, },
      { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_PSS_STALL_SYNC_BIT,               .ds = INTEL_DS_PSS_STALL_SYNC_BIT, },
      { .anv = ANV_PIPE_END_OF_PIPE_SYNC_BIT,             .ds = INTEL_DS_END_OF_PIPE_BIT, },
      { .anv = ANV_PIPE_CCS_CACHE_FLUSH_BIT,              .ds = INTEL_DS_CCS_CACHE_FLUSH_BIT, },
   };

   enum intel_ds_stall_flag ret = 0;
   for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) {
      if (anv_to_ds_flags[i].anv & bits)
         ret |= anv_to_ds_flags[i].ds;
   }

   return ret;
}

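/* Hook the VK_EXT_debug_utils label entrypoints so that application labels
 * show up as annotation ranges in command buffer traces.
 */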
void anv_CmdBeginDebugUtilsLabelEXT(
   VkCommandBuffer _commandBuffer,
   const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   vk_common_CmdBeginDebugUtilsLabelEXT(_commandBuffer, pLabelInfo);

   trace_intel_begin_cmd_buffer_annotation(&cmd_buffer->trace);
}

void anv_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   if (cmd_buffer->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *label =
         util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT);

      trace_intel_end_cmd_buffer_annotation(&cmd_buffer->trace,
                                            strlen(label->pLabelName),
                                            label->pLabelName);
   }

   vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer);
}

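/* Record a queue-level trace marker by submitting a small batch containing
 * only the begin/end tracepoint (frame or annotation) followed by a batch
 * buffer end.
 */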
void
anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin)
{
   struct anv_device *device = queue->device;

   VkResult result;
   struct anv_utrace_submit *submit =
      vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!submit)
      return;

   result = anv_async_submit_init(&submit->base, queue,
                                  &device->batch_bo_pool,
                                  false, true);
   if (result != VK_SUCCESS)
      goto error_async;

   intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);

   struct anv_batch *batch = &submit->base.batch;
   if (frame) {
      if (begin)
         trace_intel_begin_frame(&submit->ds.trace, batch);
      else
         trace_intel_end_frame(&submit->ds.trace, batch,
                               device->debug_frame_desc->frame_id);
   } else {
      if (begin) {
         trace_intel_begin_queue_annotation(&submit->ds.trace, batch);
      } else {
         trace_intel_end_queue_annotation(&submit->ds.trace, batch,
                                          strlen(label), label);
      }
   }

   anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_END, bbs);
   anv_batch_emit(batch, GFX9_MI_NOOP, noop);

   if (batch->status != VK_SUCCESS) {
      result = batch->status;
      goto error_batch;
   }

   intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds,
                             device->vk.current_frame, true);

   result =
      device->kmd_backend->queue_exec_async(&submit->base,
                                            0, NULL, 0, NULL);
   if (result != VK_SUCCESS)
      goto error_batch;

   if (frame && !begin)
      intel_ds_device_process(&device->ds, true);

   return;

 error_batch:
   intel_ds_flush_data_fini(&submit->ds);
   anv_async_submit_fini(&submit->base);
 error_async:
   vk_free(&device->vk.alloc, submit);
}

void
anv_QueueBeginDebugUtilsLabelEXT(
   VkQueue _queue,
   const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_queue, queue, _queue);

   vk_common_QueueBeginDebugUtilsLabelEXT(_queue, pLabelInfo);

   anv_queue_trace(queue, pLabelInfo->pLabelName,
                   false /* frame */, true /* begin */);
}

void
anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue)
{
   VK_FROM_HANDLE(anv_queue, queue, _queue);

   if (queue->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *label =
         util_dynarray_top_ptr(&queue->vk.labels, VkDebugUtilsLabelEXT);
      anv_queue_trace(queue, label->pLabelName,
                      false /* frame */, false /* begin */);

      intel_ds_device_process(&queue->device->ds, true);
   }

   vk_common_QueueEndDebugUtilsLabelEXT(_queue);
}