/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"
#include "anv_internal_kernels.h"

#include "ds/intel_tracepoints.h"
#include "genxml/gen9_pack.h"
#include "perf/intel_perf.h"
#include "util/perf/cpu_trace.h"

#include "vulkan/runtime/vk_common_entrypoints.h"

/** Timestamp structure format */
union anv_utrace_timestamp {
   /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
    * PIPE_CONTROL.
    */
   uint64_t timestamp;

   /* Timestamp written by COMPUTE_WALKER::PostSync
    *
    * Layout is described in the PRMs.
    * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
    *
    *    "The timestamp layout :
    *        [0] = 32b Context Timestamp Start
    *        [1] = 32b Global Timestamp Start
    *        [2] = 32b Context Timestamp End
    *        [3] = 32b Global Timestamp End"
    */
   uint32_t compute_walker[4];
};

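/* Count the command buffers carrying utrace points and, for those that can
 * be resubmitted (no ONE_TIME_SUBMIT flag), how many trace chunks will need
 * their timestamps copied out before reuse.
 */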
static uint32_t
command_buffers_count_utraces(struct anv_device *device,
                              uint32_t cmd_buffer_count,
                              struct anv_cmd_buffer **cmd_buffers,
                              uint32_t *utrace_copies)
{
   if (!u_trace_should_process(&device->ds.trace_context))
      return 0;

   uint32_t utraces = 0;
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      if (u_trace_has_points(&cmd_buffers[i]->trace)) {
         utraces++;
         if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
            *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks);
      }
   }

   return utraces;
}

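/* u_trace "delete_flush_data" callback: release everything attached to a
 * utrace submit once its trace data has been processed.
 */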
static void
anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_utrace_submit *submit = submit_data;

   intel_ds_flush_data_fini(&submit->ds);

   anv_state_stream_finish(&submit->dynamic_state_stream);
   anv_state_stream_finish(&submit->general_state_stream);

   if (submit->trace_bo)
      anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);

   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
      anv_bo_pool_free(&device->utrace_bo_pool, *bo);
   util_dynarray_fini(&submit->batch_bos);

   vk_sync_destroy(&device->vk, submit->sync);

   vk_free(&device->vk.alloc, submit);
}

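/* Timestamp-copy callback for render engines: copies entries using the
 * streamout-based memcpy.
 */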
static void
anv_device_utrace_emit_gfx_copy_ts_buffer(struct u_trace_context *utctx,
                                          void *cmdstream,
                                          void *ts_from, uint32_t from_offset,
                                          void *ts_to, uint32_t to_offset,
                                          uint32_t count)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_utrace_submit *submit = cmdstream;
   struct anv_address from_addr = (struct anv_address) {
      .bo = ts_from, .offset = from_offset * sizeof(union anv_utrace_timestamp) };
   struct anv_address to_addr = (struct anv_address) {
      .bo = ts_to, .offset = to_offset * sizeof(union anv_utrace_timestamp) };

   anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
                                          to_addr, from_addr,
                                          count * sizeof(union anv_utrace_timestamp));
}

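/* Timestamp-copy callback for compute engines: dispatches the internal
 * memcpy compute shader with push constants describing the copy.
 */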
static void
anv_device_utrace_emit_cs_copy_ts_buffer(struct u_trace_context *utctx,
                                         void *cmdstream,
                                         void *ts_from, uint32_t from_offset,
                                         void *ts_to, uint32_t to_offset,
                                         uint32_t count)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_utrace_submit *submit = cmdstream;
   struct anv_address from_addr = (struct anv_address) {
      .bo = ts_from, .offset = from_offset * sizeof(union anv_utrace_timestamp) };
   struct anv_address to_addr = (struct anv_address) {
      .bo = ts_to, .offset = to_offset * sizeof(union anv_utrace_timestamp) };

   struct anv_state push_data_state =
      anv_genX(device->info, simple_shader_alloc_push)(
         &submit->simple_state, sizeof(struct anv_memcpy_params));
   struct anv_memcpy_params *params = push_data_state.map;

   *params = (struct anv_memcpy_params) {
      .num_dwords = count * sizeof(union anv_utrace_timestamp) / 4,
      .src_addr = anv_address_physical(from_addr),
      .dst_addr = anv_address_physical(to_addr),
   };

   anv_genX(device->info, emit_simple_shader_dispatch)(
      &submit->simple_state, DIV_ROUND_UP(params->num_dwords, 4),
      push_data_state);
}

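/* anv_batch extension callback: allocate a new, larger BO from the utrace
 * pool, chain into it with MI_BATCH_BUFFER_START, and repoint the batch at
 * the new storage, reserving room at the end for the next chain.
 */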
static VkResult
anv_utrace_submit_extend_batch(struct anv_batch *batch, uint32_t size,
                               void *user_data)
{
   struct anv_utrace_submit *submit = user_data;

   uint32_t alloc_size = 0;
   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
      alloc_size += (*bo)->size;
   alloc_size = MAX2(alloc_size * 2, 8192);

   struct anv_bo *bo;
   VkResult result = anv_bo_pool_alloc(&submit->queue->device->utrace_bo_pool,
                                       align(alloc_size, 4096),
                                       &bo);
   if (result != VK_SUCCESS)
      return result;

   util_dynarray_append(&submit->batch_bos, struct anv_bo *, bo);

   batch->end += 4 * GFX9_MI_BATCH_BUFFER_START_length;

   anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
      bbs.DWordLength             = GFX9_MI_BATCH_BUFFER_START_length -
                                    GFX9_MI_BATCH_BUFFER_START_length_bias;
      bbs.SecondLevelBatchBuffer  = Firstlevelbatch;
      bbs.AddressSpaceIndicator   = ASI_PPGTT;
      bbs.BatchBufferStartAddress = (struct anv_address) { bo, 0 };
   }

   anv_batch_set_storage(batch,
                         (struct anv_address) { .bo = bo, },
                         bo->map,
                         bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length);

   return VK_SUCCESS;
}

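/* Build a utrace submit for a set of command buffers. Re-submittable command
 * buffers get their timestamp buffers copied into a dedicated BO (via a
 * render or compute memcpy depending on the queue's engine class) so the
 * data survives resubmission; one-time-submit buffers are flushed directly.
 */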
VkResult
anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
                                    uint32_t cmd_buffer_count,
                                    struct anv_cmd_buffer **cmd_buffers,
                                    struct anv_utrace_submit **out_submit)
{
   struct anv_device *device = queue->device;
   uint32_t utrace_copies = 0;
   uint32_t utraces = command_buffers_count_utraces(device,
                                                    cmd_buffer_count,
                                                    cmd_buffers,
                                                    &utrace_copies);
   if (!utraces) {
      *out_submit = NULL;
      return VK_SUCCESS;
   }

   VkResult result;
   struct anv_utrace_submit *submit =
      vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!submit)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit->queue = queue;

   intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);

   result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
                           0, 0, &submit->sync);
   if (result != VK_SUCCESS)
      goto error_sync;

   util_dynarray_init(&submit->batch_bos, NULL);

   if (utrace_copies > 0) {
      result = anv_bo_pool_alloc(&device->utrace_bo_pool,
                                 utrace_copies * 4096,
                                 &submit->trace_bo);
      if (result != VK_SUCCESS)
         goto error_trace_buf;

      const bool uses_relocs = device->physical->uses_relocs;
      result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
      if (result != VK_SUCCESS)
         goto error_reloc_list;

      anv_state_stream_init(&submit->dynamic_state_stream,
                            &device->dynamic_state_pool, 16384);
      anv_state_stream_init(&submit->general_state_stream,
                            &device->general_state_pool, 16384);

      submit->batch = (struct anv_batch) {
         .alloc = &device->vk.alloc,
         .relocs = &submit->relocs,
         .user_data = submit,
         .extend_cb = anv_utrace_submit_extend_batch,
      };

      /* These are the only engine classes on which we support timestamp
       * copies.
       *
       * TODO: add INTEL_ENGINE_CLASS_COPY support (should be trivial ;)
       */
      assert(queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER ||
             queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE);
      if (queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER) {

         trace_intel_begin_trace_copy_cb(&submit->ds.trace, &submit->batch);

         anv_genX(device->info, emit_so_memcpy_init)(&submit->memcpy_state,
                                                     device,
                                                     &submit->batch);
         uint32_t num_traces = 0;
         for (uint32_t i = 0; i < cmd_buffer_count; i++) {
            if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
               intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                         &submit->ds, false);
            } else {
               num_traces += cmd_buffers[i]->trace.num_traces;
               u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
                                    u_trace_end_iterator(&cmd_buffers[i]->trace),
                                    &submit->ds.trace,
                                    submit,
                                    anv_device_utrace_emit_gfx_copy_ts_buffer);
            }
         }
         anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);

         trace_intel_end_trace_copy_cb(&submit->ds.trace, &submit->batch,
                                       num_traces);

         anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
      } else {
         trace_intel_begin_trace_copy_cb(&submit->ds.trace, &submit->batch);

         submit->simple_state = (struct anv_simple_shader) {
            .device = device,
            .dynamic_state_stream = &submit->dynamic_state_stream,
            .general_state_stream = &submit->general_state_stream,
            .batch = &submit->batch,
            .kernel = device->internal_kernels[
               ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE],
            .l3_config = device->internal_kernels_l3_config,
         };
         anv_genX(device->info, emit_simple_shader_init)(&submit->simple_state);

         uint32_t num_traces = 0;
         for (uint32_t i = 0; i < cmd_buffer_count; i++) {
            if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
               intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                         &submit->ds, false);
            } else {
               num_traces += cmd_buffers[i]->trace.num_traces;
               u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
                                    u_trace_end_iterator(&cmd_buffers[i]->trace),
                                    &submit->ds.trace,
                                    submit,
                                    anv_device_utrace_emit_cs_copy_ts_buffer);
            }
         }

         trace_intel_end_trace_copy_cb(&submit->ds.trace, &submit->batch,
                                       num_traces);

         anv_genX(device->info, emit_simple_shader_end)(&submit->simple_state);
      }

      intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds, true);

      if (submit->batch.status != VK_SUCCESS) {
         result = submit->batch.status;
         goto error_batch;
      }
   } else {
      for (uint32_t i = 0; i < cmd_buffer_count; i++) {
         assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
         intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
                                   &submit->ds, i == (cmd_buffer_count - 1));
      }
   }

   *out_submit = submit;

   return VK_SUCCESS;

 error_batch:
   anv_reloc_list_finish(&submit->relocs);
   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
      anv_bo_pool_free(&device->utrace_bo_pool, *bo);
 error_reloc_list:
   anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
 error_trace_buf:
   vk_sync_destroy(&device->vk, submit->sync);
 error_sync:
   intel_ds_flush_data_fini(&submit->ds);
   vk_free(&device->vk.alloc, submit);
   return result;
}

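/* u_trace callback: allocate a timestamp buffer. u_trace sizes buffers in
 * 64-bit timestamps, so scale the request to anv's larger timestamp union.
 */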
static void *
anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);

   uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
                            sizeof(union anv_utrace_timestamp);

   struct anv_bo *bo = NULL;
   UNUSED VkResult result =
      anv_bo_pool_alloc(&device->utrace_bo_pool,
                        align(anv_ts_size_b, 4096),
                        &bo);
   assert(result == VK_SUCCESS);

   memset(bo->map, 0, bo->size);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
   if (device->physical->memory.need_flush &&
       anv_bo_needs_host_cache_flush(bo->alloc_flags))
      intel_flush_range(bo->map, bo->size);
#endif

   return bo;
}

static void
anv_utrace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_bo *bo = timestamps;

   anv_bo_pool_free(&device->utrace_bo_pool, bo);
}

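/* u_trace callback: emit the commands that capture a timestamp into the
 * buffer at the given index, choosing the capture type based on where we are
 * in the pipe and on the last recorded dispatch.
 */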
static void
anv_utrace_record_ts(struct u_trace *ut, void *cs,
                     void *timestamps, unsigned idx,
                     bool end_of_pipe)
{
   struct anv_device *device =
      container_of(ut->utctx, struct anv_device, ds.trace_context);
   struct anv_cmd_buffer *cmd_buffer =
      container_of(ut, struct anv_cmd_buffer, trace);
   /* cmd_buffer is only valid if cs == NULL */
   struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
   struct anv_bo *bo = timestamps;

   struct anv_address ts_address = (struct anv_address) {
      .bo = bo,
      .offset = idx * sizeof(union anv_utrace_timestamp)
   };

   /* Is this an end of compute trace point? */
   const bool is_end_compute =
      cs == NULL &&
      (cmd_buffer->last_compute_walker != NULL ||
       cmd_buffer->last_indirect_dispatch != NULL) &&
      end_of_pipe;

   enum anv_timestamp_capture_type capture_type = end_of_pipe ?
      (is_end_compute ?
       (cmd_buffer->last_indirect_dispatch != NULL ?
        ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH : ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER) :
       ANV_TIMESTAMP_CAPTURE_END_OF_PIPE) : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;

   void *addr = capture_type == ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH ?
                cmd_buffer->last_indirect_dispatch :
                capture_type == ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER ?
                cmd_buffer->last_compute_walker : NULL;

   device->physical->cmd_emit_timestamp(batch, device, ts_address,
                                        capture_type,
                                        addr);
   if (is_end_compute) {
      cmd_buffer->last_compute_walker = NULL;
      cmd_buffer->last_indirect_dispatch = NULL;
   }
}

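/* u_trace callback: read a timestamp back on the CPU. Waits for the submit
 * to complete on the first entry, then translates GPU ticks to nanoseconds,
 * reconstructing full 64-bit values for 32-bit COMPUTE_WALKER timestamps.
 */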
static uint64_t
anv_utrace_read_ts(struct u_trace_context *utctx,
                   void *timestamps, unsigned idx, void *flush_data)
{
   struct anv_device *device =
      container_of(utctx, struct anv_device, ds.trace_context);
   struct anv_bo *bo = timestamps;
   struct anv_utrace_submit *submit = flush_data;

   /* Only need to stall on results for the first entry: */
   if (idx == 0) {
      MESA_TRACE_SCOPE("anv utrace wait timestamps");
      UNUSED VkResult result =
         vk_sync_wait(&device->vk,
                      submit->sync,
                      0,
                      VK_SYNC_WAIT_COMPLETE,
                      os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
      assert(result == VK_SUCCESS);
   }

   union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;

   /* Don't translate the no-timestamp marker: */
   if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
      return U_TRACE_NO_TIMESTAMP;

   /* Detect a 16-byte timestamp write */
   if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
      /* The timestamp written by COMPUTE_WALKER::PostSync only has 32 bits.
       * We need to rebuild the full 64 bits using the previous timestamp. We
       * assume that utrace reads the timestamps in order. In any case a
       * 32-bit timestamp rolls over in a few minutes, so in most cases this
       * should be correct.
       */
      uint64_t timestamp =
         (submit->last_full_timestamp & 0xffffffff00000000) |
         (uint64_t) ts[idx].compute_walker[3];

      return intel_device_info_timebase_scale(device->info, timestamp);
   }

   submit->last_full_timestamp = ts[idx].timestamp;

   return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
}

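/* Set up the device-level utrace state: the BO pool used for timestamp and
 * batch buffers, the intel_ds device, the u_trace context with the callbacks
 * above, and one intel_ds queue per Vulkan queue.
 */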
void
anv_device_utrace_init(struct anv_device *device)
{
   anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace",
                    ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_HOST_CACHED_COHERENT);
   intel_ds_device_init(&device->ds, device->info, device->fd,
                        device->physical->local_minor,
                        INTEL_DS_API_VULKAN);
   u_trace_context_init(&device->ds.trace_context,
                        &device->ds,
                        anv_utrace_create_ts_buffer,
                        anv_utrace_destroy_ts_buffer,
                        anv_utrace_record_ts,
                        anv_utrace_read_ts,
                        anv_utrace_delete_submit);

   for (uint32_t q = 0; q < device->queue_count; q++) {
      struct anv_queue *queue = &device->queues[q];

      intel_ds_device_init_queue(&device->ds, &queue->ds, "%s%u",
                                 intel_engines_class_to_string(queue->family->engine_class),
                                 queue->vk.index_in_family);
   }
}

void
anv_device_utrace_finish(struct anv_device *device)
{
   intel_ds_device_process(&device->ds, true);
   intel_ds_device_fini(&device->ds);
   anv_bo_pool_finish(&device->utrace_bo_pool);
}

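/* Translate anv PIPE_CONTROL bits into the intel_ds stall flags used to
 * annotate stall tracepoints.
 */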
enum intel_ds_stall_flag
anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)
{
   static const struct {
      enum anv_pipe_bits anv;
      enum intel_ds_stall_flag ds;
   } anv_to_ds_flags[] = {
      { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,            .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT,             .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT,             .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,    .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,       .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,    .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT,          .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,     .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, },
      { .anv = ANV_PIPE_DEPTH_STALL_BIT,                  .ds = INTEL_DS_DEPTH_STALL_BIT, },
      { .anv = ANV_PIPE_CS_STALL_BIT,                     .ds = INTEL_DS_CS_STALL_BIT, },
      { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT,           .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, },
      { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT,          .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, },
      { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, },
      { .anv = ANV_PIPE_PSS_STALL_SYNC_BIT,               .ds = INTEL_DS_PSS_STALL_SYNC_BIT, },
      { .anv = ANV_PIPE_END_OF_PIPE_SYNC_BIT,             .ds = INTEL_DS_END_OF_PIPE_BIT, },
      { .anv = ANV_PIPE_CCS_CACHE_FLUSH_BIT,              .ds = INTEL_DS_CCS_CACHE_FLUSH_BIT, },
   };

   enum intel_ds_stall_flag ret = 0;
   for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) {
      if (anv_to_ds_flags[i].anv & bits)
         ret |= anv_to_ds_flags[i].ds;
   }

   return ret;
}

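/* Debug-utils labels are forwarded to the common layer and also recorded as
 * trace annotations so they appear alongside the GPU timestamps.
 */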
void anv_CmdBeginDebugUtilsLabelEXT(
    VkCommandBuffer _commandBuffer,
    const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   vk_common_CmdBeginDebugUtilsLabelEXT(_commandBuffer, pLabelInfo);

   trace_intel_begin_cmd_buffer_annotation(&cmd_buffer->trace);
}

void anv_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer)
{
   VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);

   if (cmd_buffer->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *label =
         util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT);

      trace_intel_end_cmd_buffer_annotation(&cmd_buffer->trace,
                                            strlen(label->pLabelName),
                                            label->pLabelName);
   }

   vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer);
}

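/* Emit a small self-contained batch that records a begin/end marker (either
 * a frame or a queue annotation) and submit it directly to the queue.
 */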
void
anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin)
{
   struct anv_device *device = queue->device;

   VkResult result;
   struct anv_utrace_submit *submit =
      vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!submit)
      return;

   submit->queue = queue;

   intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);

   result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
                           0, 0, &submit->sync);
   if (result != VK_SUCCESS)
      goto error_trace;

   const bool uses_relocs = device->physical->uses_relocs;
   result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
   if (result != VK_SUCCESS)
      goto error_sync;

   submit->batch = (struct anv_batch) {
      .alloc = &device->vk.alloc,
      .relocs = &submit->relocs,
      .user_data = submit,
      .extend_cb = anv_utrace_submit_extend_batch,
   };

   if (frame) {
      if (begin)
         trace_intel_begin_frame(&submit->ds.trace, &submit->batch);
      else
         trace_intel_end_frame(&submit->ds.trace, &submit->batch,
                               device->debug_frame_desc->frame_id);
   } else {
      if (begin) {
         trace_intel_begin_queue_annotation(&submit->ds.trace, &submit->batch);
      } else {
         trace_intel_end_queue_annotation(&submit->ds.trace,
                                          &submit->batch,
                                          strlen(label),
                                          label);
      }
   }

   anv_batch_emit(&submit->batch, GFX9_MI_BATCH_BUFFER_END, bbs);
   anv_batch_emit(&submit->batch, GFX9_MI_NOOP, noop);

   if (submit->batch.status != VK_SUCCESS) {
      result = submit->batch.status;
      goto error_reloc_list;
   }

   intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds, true);

   pthread_mutex_lock(&device->mutex);
   device->kmd_backend->queue_exec_trace(queue, submit);
   pthread_mutex_unlock(&device->mutex);

   return;

 error_reloc_list:
   anv_reloc_list_finish(&submit->relocs);
   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
      anv_bo_pool_free(&device->utrace_bo_pool, *bo);
 error_sync:
   vk_sync_destroy(&device->vk, submit->sync);
 error_trace:
   intel_ds_flush_data_fini(&submit->ds);
   vk_free(&device->vk.alloc, submit);
}


void
anv_QueueBeginDebugUtilsLabelEXT(
   VkQueue _queue,
   const VkDebugUtilsLabelEXT *pLabelInfo)
{
   VK_FROM_HANDLE(anv_queue, queue, _queue);

   vk_common_QueueBeginDebugUtilsLabelEXT(_queue, pLabelInfo);

   anv_queue_trace(queue, pLabelInfo->pLabelName,
                   false /* frame */, true /* begin */);
}

void
anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue)
{
   VK_FROM_HANDLE(anv_queue, queue, _queue);

   if (queue->vk.labels.size > 0) {
      const VkDebugUtilsLabelEXT *label =
         util_dynarray_top_ptr(&queue->vk.labels, VkDebugUtilsLabelEXT);
      anv_queue_trace(queue, label->pLabelName,
                      false /* frame */, false /* begin */);

      intel_ds_device_process(&queue->device->ds, true);
   }

   vk_common_QueueEndDebugUtilsLabelEXT(_queue);
}