/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include <inttypes.h>

#include "radv_buffer.h"
#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_entrypoints.h"
#include "radv_perfcounter.h"
#include "radv_spm.h"
#include "radv_sqtt.h"
#include "sid.h"

#include "ac_pm4.h"

#include "vk_command_pool.h"
#include "vk_common_entrypoints.h"

bool
radv_is_instruction_timing_enabled(void)
{
   return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true);
}

bool
radv_sqtt_queue_events_enabled(void)
{
   return debug_get_bool_option("RADV_THREAD_TRACE_QUEUE_EVENTS", true);
}

static enum radv_queue_family
radv_ip_to_queue_family(enum amd_ip_type t)
{
   switch (t) {
   case AMD_IP_GFX:
      return RADV_QUEUE_GENERAL;
   case AMD_IP_COMPUTE:
      return RADV_QUEUE_COMPUTE;
   case AMD_IP_SDMA:
      return RADV_QUEUE_TRANSFER;
   default:
      unreachable("Unknown IP type");
   }
}

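/* Emit a full cache flush and wait-for-idle (CS partial flush, plus PS partial flush on
 * graphics queues) so that in-flight work finishes before SQTT state changes. */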
static void
radv_emit_wait_for_idle(const struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum radv_queue_family qf = radv_ip_to_queue_family(family);
   enum rgp_flush_bits sqtt_flush_bits = 0;
   radv_cs_emit_cache_flush(
      device->ws, cs, pdev->info.gfx_level, NULL, 0, qf,
      (family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH
                                    : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
         RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2,
      &sqtt_flush_bits, 0);
}

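/* Build the SQTT start sequence with the common ac_sqtt_emit_start() helper and copy the
 * resulting PM4 packets into the given command stream. */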
static void
radv_emit_sqtt_start(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const bool is_compute_queue = qf == RADV_QUEUE_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&pdev->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_start(&pdev->info, pm4, &device->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_check_space(device->ws, cs, pm4->ndw);
   radeon_emit_array(cs, pm4->pm4, pm4->ndw);

   ac_pm4_free_state(pm4);
}

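/* Emit the SQTT stop sequence: stop the trace, optionally wait for idle on chips affected
 * by the RB harvest bug, then wait for the trace to finish and flush its data. */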
static void
radv_emit_sqtt_stop(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const bool is_compute_queue = qf == RADV_QUEUE_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&pdev->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_stop(&pdev->info, pm4, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_check_space(device->ws, cs, pm4->ndw);
   radeon_emit_array(cs, pm4->pm4, pm4->ndw);

   ac_pm4_clear_state(pm4, &pdev->info, false, is_compute_queue);

   if (pdev->info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE doesn't work. */
      radv_emit_wait_for_idle(device, cs, qf);
   }

   ac_sqtt_emit_wait(&pdev->info, pm4, &device->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_check_space(device->ws, cs, pm4->ndw);
   radeon_emit_array(cs, pm4->pm4, pm4->ndw);

   ac_pm4_free_state(pm4);
}

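/* Write arbitrary user data into the thread trace by programming the
 * SQ_THREAD_TRACE_USERDATA_2/3 registers, two dwords at a time. */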
void
radv_emit_sqtt_userdata(const struct radv_cmd_buffer *cmd_buffer, const void *data, uint32_t num_dwords)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
   const enum radv_queue_family qf = cmd_buffer->qf;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const uint32_t *dwords = (uint32_t *)data;

   /* SQTT user data packets aren't supported on SDMA queues. */
   if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
      return;

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_check_space(device->ws, cs, 2 + count);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      if (pdev->info.gfx_level >= GFX10)
         radeon_set_uconfig_perfctr_reg_seq(gfx_level, qf, cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      else
         radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(cs, dwords, count);

      dwords += count;
      num_dwords -= count;
   }
}

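/* Enable or disable the SQG TOP/BOP events in SPI_CONFIG_CNTL, which SQTT relies on to
 * collect thread trace data. The register offset and privilege level differ per generation. */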
void
radv_emit_spi_config_cntl(const struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pdev->info.gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (pdev->info.gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(cs, R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
}

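/* Inhibit (or re-enable) perfmon clock gating while profiling. Not needed on GFX11+, and
 * the RLC_PERFMON_CLK_CNTL offset differs between GFX8-GFX9 and GFX10. */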
void
radv_emit_inhibit_clockgating(const struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pdev->info.gfx_level >= GFX11)
      return; /* not needed */

   if (pdev->info.gfx_level >= GFX10) {
      radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (pdev->info.gfx_level >= GFX8) {
      radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
}

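/* Sub-allocate an 8-byte GPU timestamp slot from a shared, CPU-visible GTT buffer. When the
 * current buffer is full, a new one (at least 4 KiB, or twice the previous size) is allocated
 * and the old one is kept on a list until the next trace reset. */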
VkResult
radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo,
                                uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr)
{
   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   if (device->sqtt_timestamp.offset + 8 > device->sqtt_timestamp.size) {
      struct radeon_winsys_bo *bo;
      uint64_t new_size;
      VkResult result;
      uint8_t *map;

      new_size = MAX2(4096, 2 * device->sqtt_timestamp.size);

      result = radv_bo_create(device, NULL, new_size, 8, RADEON_DOMAIN_GTT,
                              RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_SCRATCH, 0,
                              true, &bo);
      if (result != VK_SUCCESS) {
         simple_mtx_unlock(&device->sqtt_timestamp_mtx);
         return result;
      }

      map = radv_buffer_map(device->ws, bo);
      if (!map) {
         radv_bo_destroy(device, NULL, bo);
         simple_mtx_unlock(&device->sqtt_timestamp_mtx);
         return VK_ERROR_OUT_OF_DEVICE_MEMORY;
      }

      if (device->sqtt_timestamp.bo) {
         struct radv_sqtt_timestamp *new_timestamp;

         new_timestamp = malloc(sizeof(*new_timestamp));
         if (!new_timestamp) {
            radv_bo_destroy(device, NULL, bo);
            simple_mtx_unlock(&device->sqtt_timestamp_mtx);
            return VK_ERROR_OUT_OF_HOST_MEMORY;
         }

         memcpy(new_timestamp, &device->sqtt_timestamp, sizeof(*new_timestamp));
         list_add(&new_timestamp->list, &device->sqtt_timestamp.list);
      }

      device->sqtt_timestamp.bo = bo;
      device->sqtt_timestamp.size = new_size;
      device->sqtt_timestamp.offset = 0;
      device->sqtt_timestamp.map = map;
   }

   *gpu_timestamp_bo = device->sqtt_timestamp.bo;
   *gpu_timestamp_offset = device->sqtt_timestamp.offset;
   *gpu_timestamp_ptr = device->sqtt_timestamp.map + device->sqtt_timestamp.offset;

   device->sqtt_timestamp.offset += 8;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);

   return VK_SUCCESS;
}

static void
radv_sqtt_reset_timestamp(struct radv_device *device)
{
   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   list_for_each_entry_safe (struct radv_sqtt_timestamp, ts, &device->sqtt_timestamp.list, list) {
      radv_bo_destroy(device, NULL, ts->bo);
      list_del(&ts->list);
      free(ts);
   }

   device->sqtt_timestamp.offset = 0;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);
}

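/* Create the command pools (graphics, and compute unless it is disabled) used to allocate
 * the internal timed command buffers for SQTT queue events, plus the timestamp bookkeeping. */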
static bool
radv_sqtt_init_queue_event(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   VkCommandPool cmd_pool;
   VkResult result;

   const VkCommandPoolCreateInfo create_gfx_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
      .queueFamilyIndex = RADV_QUEUE_GENERAL, /* Graphics queue is always the first queue. */
   };

   result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_gfx_info, NULL, &cmd_pool);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt_command_pool[0] = vk_command_pool_from_handle(cmd_pool);

   if (!(instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
      const VkCommandPoolCreateInfo create_comp_info = {
         .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
         .queueFamilyIndex = RADV_QUEUE_COMPUTE,
      };

      result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_comp_info, NULL, &cmd_pool);
      if (result != VK_SUCCESS)
         return false;

      device->sqtt_command_pool[1] = vk_command_pool_from_handle(cmd_pool);
   }

   simple_mtx_init(&device->sqtt_command_pool_mtx, mtx_plain);

   simple_mtx_init(&device->sqtt_timestamp_mtx, mtx_plain);
   list_inithead(&device->sqtt_timestamp.list);

   return true;
}

static void
radv_sqtt_finish_queue_event(struct radv_device *device)
{
   if (device->sqtt_timestamp.bo)
      radv_bo_destroy(device, NULL, device->sqtt_timestamp.bo);

   simple_mtx_destroy(&device->sqtt_timestamp_mtx);

   for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++)
      vk_common_DestroyCommandPool(radv_device_to_handle(device),
                                   vk_command_pool_to_handle(device->sqtt_command_pool[i]), NULL);

   simple_mtx_destroy(&device->sqtt_command_pool_mtx);
}

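/* Allocate the thread trace output buffer: one ac_sqtt_data_info header region followed by
 * one trace buffer per shader engine, with the per-SE size aligned as required by the HW. */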
static bool
radv_sqtt_init_bo(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(&pdev->info);
   unsigned max_se = pdev->info.max_se;
   struct radeon_winsys *ws = device->ws;
   VkResult result;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1ull << align_shift);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << align_shift);
   size += device->sqtt.buffer_size * (uint64_t)max_se;

   struct radeon_winsys_bo *bo = NULL;
   result = radv_bo_create(device, NULL, size, 4096, RADEON_DOMAIN_VRAM,
                           RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
                           RADV_BO_PRIORITY_SCRATCH, 0, true, &bo);
   device->sqtt.bo = bo;
   if (result != VK_SUCCESS)
      return false;

   result = ws->buffer_make_resident(ws, device->sqtt.bo, true);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt.ptr = radv_buffer_map(ws, device->sqtt.bo);
   if (!device->sqtt.ptr)
      return false;

   device->sqtt.buffer_va = radv_buffer_get_va(device->sqtt.bo);

   return true;
}

static void
radv_sqtt_finish_bo(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (unlikely(device->sqtt.bo)) {
      ws->buffer_make_resident(ws, device->sqtt.bo, false);
      radv_bo_destroy(device, NULL, device->sqtt.bo);
   }
}

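/* Record one RGP queue-info entry for this queue (queue handle, HW context and queue/engine
 * type) so the capture can attribute submissions to queues. */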
static VkResult
radv_register_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;
   struct rgp_queue_info_record *record;

   record = malloc(sizeof(struct rgp_queue_info_record));
   if (!record)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   record->queue_id = (uintptr_t)queue;
   record->queue_context = (uintptr_t)queue->hw_ctx;
   if (queue->vk.queue_family_index == RADV_QUEUE_GENERAL) {
      record->hardware_info.queue_type = SQTT_QUEUE_TYPE_UNIVERSAL;
      record->hardware_info.engine_type = SQTT_ENGINE_TYPE_UNIVERSAL;
   } else {
      record->hardware_info.queue_type = SQTT_QUEUE_TYPE_COMPUTE;
      record->hardware_info.engine_type = SQTT_ENGINE_TYPE_COMPUTE;
   }

   simple_mtx_lock(&queue_info->lock);
   list_addtail(&record->list, &queue_info->record);
   queue_info->record_count++;
   simple_mtx_unlock(&queue_info->lock);

   return VK_SUCCESS;
}

static void
radv_unregister_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;

   /* Destroy queue info record. */
   simple_mtx_lock(&queue_info->lock);
   if (queue_info->record_count > 0) {
      list_for_each_entry_safe (struct rgp_queue_info_record, record, &queue_info->record, list) {
         if (record->queue_id == (uintptr_t)queue) {
            queue_info->record_count--;
            list_del(&record->list);
            free(record);
            break;
         }
      }
   }
   simple_mtx_unlock(&queue_info->lock);
}

static void
radv_register_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_register_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
      radv_register_queue(device, &device->queues[RADV_QUEUE_COMPUTE][i]);
}

static void
radv_unregister_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_COMPUTE][i]);
}

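/* Device-level SQTT initialization: pick the per-SE buffer size (RADV_THREAD_TRACE_BUFFER_SIZE,
 * 32 MiB by default), create the trace BO, the queue-event state and the performance counters,
 * then register the existing queues with the RGP capture. */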
bool
radv_sqtt_init(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;

   /* Default buffer size set to 32MB per SE. */
   device->sqtt.buffer_size = (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);
   device->sqtt.instruction_timing_enabled = radv_is_instruction_timing_enabled();

   if (!radv_sqtt_init_bo(device))
      return false;

   if (!radv_sqtt_init_queue_event(device))
      return false;

   if (!radv_device_acquire_performance_counters(device))
      return false;

   ac_sqtt_init(sqtt);

   radv_register_queues(device, sqtt);

   return true;
}

void
radv_sqtt_finish(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct radeon_winsys *ws = device->ws;

   radv_sqtt_finish_bo(device);
   radv_sqtt_finish_queue_event(device);

   for (unsigned i = 0; i < 2; i++) {
      if (device->sqtt.start_cs[i])
         ws->cs_destroy(device->sqtt.start_cs[i]);
      if (device->sqtt.stop_cs[i])
         ws->cs_destroy(device->sqtt.stop_cs[i]);
   }

   radv_unregister_queues(device, sqtt);

   ac_sqtt_finish(sqtt);
}

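/* Grow the thread trace buffer after a failed capture: destroy the current BO, double the
 * per-SE size and allocate a new one. */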
static bool
radv_sqtt_resize_bo(struct radv_device *device)
{
   /* Destroy the previous thread trace BO. */
   radv_sqtt_finish_bo(device);

   /* Double the size of the thread trace buffer per SE. */
   device->sqtt.buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           device->sqtt.buffer_size / 1024);

   /* Re-create the thread trace BO. */
   return radv_sqtt_init_bo(device);
}

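/* Build and submit the per-queue "start" command stream: wait for idle, inhibit clock gating,
 * enable SQG events, optionally set up SPM, then emit the SQTT start packets. The CS is kept
 * in device->sqtt.start_cs[] so it can be destroyed on the next capture. */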
static bool
radv_begin_sqtt(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   enum radv_queue_family family = queue->state.qf;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous start CS and create a new one. */
   if (device->sqtt.start_cs[family]) {
      ws->cs_destroy(device->sqtt.start_cs[family]);
      device->sqtt.start_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, radv_queue_ring(queue), false);
   if (!cs)
      return false;

   radeon_check_space(ws, cs, 512);

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   default:
      unreachable("Incorrect queue family");
      break;
   }

   /* Make sure to wait-for-idle before starting SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Disable clock gating before starting SQTT. */
   radv_emit_inhibit_clockgating(device, cs, true);

   /* Enable SQG events that collect thread trace data. */
   radv_emit_spi_config_cntl(device, cs, true);

   radv_perfcounter_emit_spm_reset(cs);

   if (device->spm.bo) {
      /* Enable all shader stages by default. */
      radv_perfcounter_emit_shaders(device, cs, ac_sqtt_get_shader_mask(&pdev->info));

      radv_emit_spm_setup(device, cs, family);
   }

   /* Start SQTT. */
   radv_emit_sqtt_start(device, cs, family);

   if (device->spm.bo) {
      radeon_check_space(ws, cs, 8);
      radv_perfcounter_emit_spm_start(device, cs, family);
   }

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->sqtt.start_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

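/* Build and submit the per-queue "stop" command stream: wait for idle, stop SPM if enabled,
 * emit the SQTT stop packets, then restore SQG events and clock gating. */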
static bool
radv_end_sqtt(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   enum radv_queue_family family = queue->state.qf;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous stop CS and create a new one. */
   if (device->sqtt.stop_cs[family]) {
      ws->cs_destroy(device->sqtt.stop_cs[family]);
      device->sqtt.stop_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, radv_queue_ring(queue), false);
   if (!cs)
      return false;

   radeon_check_space(ws, cs, 512);

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   default:
      unreachable("Incorrect queue family");
      break;
   }

   /* Make sure to wait-for-idle before stopping SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   if (device->spm.bo) {
      radeon_check_space(ws, cs, 8);
      radv_perfcounter_emit_spm_stop(device, cs, family);
   }

   /* Stop SQTT. */
   radv_emit_sqtt_stop(device, cs, family);

   radv_perfcounter_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   radv_emit_spi_config_cntl(device, cs, false);

   /* Restore previous state by re-enabling clock gating. */
   radv_emit_inhibit_clockgating(device, cs, false);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->sqtt.stop_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

void
radv_sqtt_start_capturing(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (ac_check_profile_state(&pdev->info)) {
      fprintf(stderr, "radv: Canceling RGP trace request as a hang condition has been "
                      "detected. Force the GPU into a profiling mode with e.g. "
                      "\"echo profile_peak > "
                      "/sys/class/drm/card0/device/power_dpm_force_performance_level\"\n");
      return;
   }

   /* Sample CPU/GPU clocks before starting the trace. */
   if (!radv_sqtt_sample_clocks(device)) {
      fprintf(stderr, "radv: Failed to sample clocks\n");
   }

   radv_begin_sqtt(queue);
   assert(!device->sqtt_enabled);
   device->sqtt_enabled = true;
}

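/* Stop the capture on this queue, wait for the GPU to go idle, dump the RGP capture if the
 * SQTT (and optional SPM) data could be retrieved, then reset the per-capture state. Returns
 * false when the trace could not be captured (e.g. the buffer was too small). */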
bool
radv_sqtt_stop_capturing(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct ac_sqtt_trace sqtt_trace = {0};
   struct ac_spm_trace spm_trace;
   bool captured = true;

   radv_end_sqtt(queue);
   device->sqtt_enabled = false;

   /* TODO: Do something better than this whole sync. */
   device->vk.dispatch_table.QueueWaitIdle(radv_queue_to_handle(queue));

   if (radv_get_sqtt_trace(queue, &sqtt_trace) && (!device->spm.bo || radv_get_spm_trace(queue, &spm_trace))) {
      ac_dump_rgp_capture(&pdev->info, &sqtt_trace, device->spm.bo ? &spm_trace : NULL);
   } else {
      /* Failed to capture because the buffer was too small. */
      captured = false;
   }

   /* Clear resources used for this capture. */
   radv_reset_sqtt_trace(device);

   return captured;
}

bool
radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radeon_info *gpu_info = &pdev->info;

   if (!ac_sqtt_get_trace(&device->sqtt, gpu_info, sqtt_trace)) {
      if (!radv_sqtt_resize_bo(device))
         fprintf(stderr, "radv: Failed to resize the SQTT buffer.\n");
      return false;
   }

   return true;
}

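/* Clear all per-capture state: clock calibration records, queue event records, timestamp
 * buffers, and the command pools that hold the timed command buffers. */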
void
radv_reset_sqtt_trace(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
   struct rgp_queue_event *queue_event = &sqtt->rgp_queue_event;

   /* Clear clock calibration records. */
   simple_mtx_lock(&clock_calibration->lock);
   list_for_each_entry_safe (struct rgp_clock_calibration_record, record, &clock_calibration->record, list) {
      clock_calibration->record_count--;
      list_del(&record->list);
      free(record);
   }
   simple_mtx_unlock(&clock_calibration->lock);

   /* Clear queue event records. */
   simple_mtx_lock(&queue_event->lock);
   list_for_each_entry_safe (struct rgp_queue_event_record, record, &queue_event->record, list) {
      list_del(&record->list);
      free(record);
   }
   queue_event->record_count = 0;
   simple_mtx_unlock(&queue_event->lock);

   /* Clear timestamps. */
   radv_sqtt_reset_timestamp(device);

   /* Clear timed cmdbufs. */
   simple_mtx_lock(&device->sqtt_command_pool_mtx);
   for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++) {
      /* If RADV_DEBUG_NO_COMPUTE_QUEUE is used, there's no compute sqtt command pool */
      if (device->sqtt_command_pool[i])
         vk_common_TrimCommandPool(radv_device_to_handle(device),
                                   vk_command_pool_to_handle(device->sqtt_command_pool[i]), 0);
   }
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
}

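/* Helper around vkGetCalibratedTimestampsKHR that samples CLOCK_MONOTONIC and the device
 * time domain in a single call. */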
static VkResult
radv_get_calibrated_timestamps(struct radv_device *device, uint64_t *cpu_timestamp, uint64_t *gpu_timestamp)
{
   uint64_t timestamps[2];
   uint64_t max_deviation;
   VkResult result;

   const VkCalibratedTimestampInfoKHR timestamp_infos[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
         .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
      },
      {
         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
         .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
      },
   };

   result = device->vk.dispatch_table.GetCalibratedTimestampsKHR(radv_device_to_handle(device), 2, timestamp_infos,
                                                                 timestamps, &max_deviation);
   if (result != VK_SUCCESS)
      return result;

   *cpu_timestamp = timestamps[0];
   *gpu_timestamp = timestamps[1];

   return result;
}

bool
radv_sqtt_sample_clocks(struct radv_device *device)
{
   uint64_t cpu_timestamp = 0, gpu_timestamp = 0;
   VkResult result;

   result = radv_get_calibrated_timestamps(device, &cpu_timestamp, &gpu_timestamp);
   if (result != VK_SUCCESS)
      return false;

   return ac_sqtt_add_clock_calibration(&device->sqtt, cpu_timestamp, gpu_timestamp);
}

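/* Allocate a small one-time-submit command buffer from the SQTT command pool of the queue's
 * family that writes a GPU timestamp for the given stage at timestamp_bo + timestamp_offset,
 * used by the SQTT queue-event path to time submissions. */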
VkResult
radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_winsys_bo *timestamp_bo, uint32_t timestamp_offset,
                           VkPipelineStageFlags2 timestamp_stage, VkCommandBuffer *pcmdbuf)
{
   struct radv_device *device = radv_queue_device(queue);
   enum radv_queue_family queue_family = queue->state.qf;
   VkCommandBuffer cmdbuf;
   uint64_t timestamp_va;
   VkResult result;

   assert(queue_family == RADV_QUEUE_GENERAL || queue_family == RADV_QUEUE_COMPUTE);

   simple_mtx_lock(&device->sqtt_command_pool_mtx);

   const VkCommandBufferAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
      .commandPool = vk_command_pool_to_handle(device->sqtt_command_pool[queue_family]),
      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
      .commandBufferCount = 1,
   };

   result = vk_common_AllocateCommandBuffers(radv_device_to_handle(device), &alloc_info, &cmdbuf);
   if (result != VK_SUCCESS)
      goto fail;

   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
   };

   result = radv_BeginCommandBuffer(cmdbuf, &begin_info);
   if (result != VK_SUCCESS)
      goto fail;

   radeon_check_space(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, 28);

   timestamp_va = radv_buffer_get_va(timestamp_bo) + timestamp_offset;

   radv_cs_add_buffer(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, timestamp_bo);

   radv_write_timestamp(radv_cmd_buffer_from_handle(cmdbuf), timestamp_va, timestamp_stage);

   result = radv_EndCommandBuffer(cmdbuf);
   if (result != VK_SUCCESS)
      goto fail;

   *pcmdbuf = cmdbuf;

fail:
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
   return result;
}