/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include <inttypes.h>

#include "radv_buffer.h"
#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_entrypoints.h"
#include "radv_perfcounter.h"
#include "radv_spm.h"
#include "radv_sqtt.h"
#include "sid.h"

#include "ac_pm4.h"

#include "vk_command_pool.h"
#include "vk_common_entrypoints.h"

bool
radv_is_instruction_timing_enabled(void)
{
   return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true);
}

bool
radv_sqtt_queue_events_enabled(void)
{
   return debug_get_bool_option("RADV_THREAD_TRACE_QUEUE_EVENTS", true);
}

static enum radv_queue_family
radv_ip_to_queue_family(enum amd_ip_type t)
{
   switch (t) {
   case AMD_IP_GFX:
      return RADV_QUEUE_GENERAL;
   case AMD_IP_COMPUTE:
      return RADV_QUEUE_COMPUTE;
   case AMD_IP_SDMA:
      return RADV_QUEUE_TRANSFER;
   default:
      unreachable("Unknown IP type");
   }
}

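/* Flush and invalidate all caches and wait for the queue to be idle. SQTT
 * must not be started or stopped while previous work is still in flight.
 */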
static void
radv_emit_wait_for_idle(const struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum radv_queue_family qf = radv_ip_to_queue_family(family);
   enum rgp_flush_bits sqtt_flush_bits = 0;

   radv_cs_emit_cache_flush(
      device->ws, cs, pdev->info.gfx_level, NULL, 0, qf,
      (family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH
                                    : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
         RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2,
      &sqtt_flush_bits, 0);
}

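/* Emit the PM4 packets that program the SQTT state and start recording. */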
static void
radv_emit_sqtt_start(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const bool is_compute_queue = qf == RADV_QUEUE_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&pdev->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_start(&pdev->info, pm4, &device->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_check_space(device->ws, cs, pm4->ndw);
   radeon_emit_array(cs, pm4->pm4, pm4->ndw);

   ac_pm4_free_state(pm4);
}

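/* Emit the PM4 packets that stop SQTT, then wait until the trace data has
 * been fully written back to memory.
 */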
static void
radv_emit_sqtt_stop(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const bool is_compute_queue = qf == RADV_QUEUE_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&pdev->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_stop(&pdev->info, pm4, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_check_space(device->ws, cs, pm4->ndw);
   radeon_emit_array(cs, pm4->pm4, pm4->ndw);

   ac_pm4_clear_state(pm4, &pdev->info, false, is_compute_queue);

   if (pdev->info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE doesn't work. */
      radv_emit_wait_for_idle(device, cs, qf);
   }

   ac_sqtt_emit_wait(&pdev->info, pm4, &device->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_check_space(device->ws, cs, pm4->ndw);
   radeon_emit_array(cs, pm4->pm4, pm4->ndw);

   ac_pm4_free_state(pm4);
}

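/* Write user data (e.g. RGP markers) through the SQ_THREAD_TRACE_USERDATA
 * registers; writes to these registers are recorded into the trace stream.
 */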
void
radv_emit_sqtt_userdata(const struct radv_cmd_buffer *cmd_buffer, const void *data, uint32_t num_dwords)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
   const enum radv_queue_family qf = cmd_buffer->qf;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const uint32_t *dwords = (uint32_t *)data;

   /* SQTT user data packets aren't supported on SDMA queues. */
   if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
      return;

   /* The register sequence starts at SQ_THREAD_TRACE_USERDATA_2 and can span
    * the two consecutive USERDATA_2/3 registers, hence chunks of at most two
    * dwords. */
   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_check_space(device->ws, cs, 2 + count);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      if (pdev->info.gfx_level >= GFX10)
         radeon_set_uconfig_perfctr_reg_seq(gfx_level, qf, cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      else
         radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(cs, dwords, count);

      dwords += count;
      num_dwords -= count;
   }
}

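/* Toggle the SQG top-of-pipe/bottom-of-pipe events that generate the thread
 * trace data, via SPI_CONFIG_CNTL.
 */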
void
radv_emit_spi_config_cntl(const struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pdev->info.gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (pdev->info.gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(cs, R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
}

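/* Inhibit (or re-enable) clock gating of the performance monitor clocks while
 * profiling; not needed on GFX11+.
 */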
void
radv_emit_inhibit_clockgating(const struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pdev->info.gfx_level >= GFX11)
      return; /* not needed */

   if (pdev->info.gfx_level >= GFX10) {
      radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (pdev->info.gfx_level >= GFX8) {
      radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
}

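/* Sub-allocate an 8-byte GPU timestamp slot from a growing GTT buffer. When
 * the current buffer is full, a buffer of twice the size is allocated and the
 * old one is kept on a list until the next trace reset.
 */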
VkResult
radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo,
                                uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr)
{
   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   if (device->sqtt_timestamp.offset + 8 > device->sqtt_timestamp.size) {
      struct radeon_winsys_bo *bo;
      uint64_t new_size;
      VkResult result;
      uint8_t *map;

      new_size = MAX2(4096, 2 * device->sqtt_timestamp.size);

      result = radv_bo_create(device, NULL, new_size, 8, RADEON_DOMAIN_GTT,
                              RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_SCRATCH, 0,
                              true, &bo);
      if (result != VK_SUCCESS) {
         simple_mtx_unlock(&device->sqtt_timestamp_mtx);
         return result;
      }

      map = radv_buffer_map(device->ws, bo);
      if (!map) {
         radv_bo_destroy(device, NULL, bo);
         simple_mtx_unlock(&device->sqtt_timestamp_mtx);
         return VK_ERROR_OUT_OF_DEVICE_MEMORY;
      }

      if (device->sqtt_timestamp.bo) {
         struct radv_sqtt_timestamp *new_timestamp;

         new_timestamp = malloc(sizeof(*new_timestamp));
         if (!new_timestamp) {
            radv_bo_destroy(device, NULL, bo);
            simple_mtx_unlock(&device->sqtt_timestamp_mtx);
            return VK_ERROR_OUT_OF_HOST_MEMORY;
         }

         /* Keep the old buffer alive on a list because in-flight command
          * buffers may still reference it. */
         memcpy(new_timestamp, &device->sqtt_timestamp, sizeof(*new_timestamp));
         list_add(&new_timestamp->list, &device->sqtt_timestamp.list);
      }

      device->sqtt_timestamp.bo = bo;
      device->sqtt_timestamp.size = new_size;
      device->sqtt_timestamp.offset = 0;
      device->sqtt_timestamp.map = map;
   }

   *gpu_timestamp_bo = device->sqtt_timestamp.bo;
   *gpu_timestamp_offset = device->sqtt_timestamp.offset;
   *gpu_timestamp_ptr = device->sqtt_timestamp.map + device->sqtt_timestamp.offset;

   device->sqtt_timestamp.offset += 8;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);

   return VK_SUCCESS;
}

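/* Free all retired timestamp buffers and rewind the current one. */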
static void
radv_sqtt_reset_timestamp(struct radv_device *device)
{
   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   list_for_each_entry_safe (struct radv_sqtt_timestamp, ts, &device->sqtt_timestamp.list, list) {
      radv_bo_destroy(device, NULL, ts->bo);
      list_del(&ts->list);
      free(ts);
   }

   device->sqtt_timestamp.offset = 0;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);
}

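/* Create the command pools (GFX, and compute unless disabled) used to
 * allocate the internal timed command buffers for queue events.
 */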
static bool
radv_sqtt_init_queue_event(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   VkCommandPool cmd_pool;
   VkResult result;

   const VkCommandPoolCreateInfo create_gfx_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
      .queueFamilyIndex = RADV_QUEUE_GENERAL, /* Graphics queue is always the first queue. */
   };

   result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_gfx_info, NULL, &cmd_pool);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt_command_pool[0] = vk_command_pool_from_handle(cmd_pool);

   if (!(instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
      const VkCommandPoolCreateInfo create_comp_info = {
         .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
         .queueFamilyIndex = RADV_QUEUE_COMPUTE,
      };

      result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_comp_info, NULL, &cmd_pool);
      if (result != VK_SUCCESS)
         return false;

      device->sqtt_command_pool[1] = vk_command_pool_from_handle(cmd_pool);
   }

   simple_mtx_init(&device->sqtt_command_pool_mtx, mtx_plain);

   simple_mtx_init(&device->sqtt_timestamp_mtx, mtx_plain);
   list_inithead(&device->sqtt_timestamp.list);

   return true;
}

static void
radv_sqtt_finish_queue_event(struct radv_device *device)
{
   if (device->sqtt_timestamp.bo)
      radv_bo_destroy(device, NULL, device->sqtt_timestamp.bo);

   simple_mtx_destroy(&device->sqtt_timestamp_mtx);

   for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++)
      vk_common_DestroyCommandPool(radv_device_to_handle(device),
                                   vk_command_pool_to_handle(device->sqtt_command_pool[i]), NULL);

   simple_mtx_destroy(&device->sqtt_command_pool_mtx);
}

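/* Allocate the thread trace BO: one ac_sqtt_data_info record per SE, followed
 * by one trace buffer per SE. Size and offsets must respect the HW alignment.
 */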
static bool
radv_sqtt_init_bo(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(&pdev->info);
   unsigned max_se = pdev->info.max_se;
   struct radeon_winsys *ws = device->ws;
   VkResult result;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1ull << align_shift);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1ull << align_shift);
   size += device->sqtt.buffer_size * (uint64_t)max_se;

   struct radeon_winsys_bo *bo = NULL;
   result = radv_bo_create(device, NULL, size, 4096, RADEON_DOMAIN_VRAM,
                           RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
                           RADV_BO_PRIORITY_SCRATCH, 0, true, &bo);
   device->sqtt.bo = bo;
   if (result != VK_SUCCESS)
      return false;

   result = ws->buffer_make_resident(ws, device->sqtt.bo, true);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt.ptr = radv_buffer_map(ws, device->sqtt.bo);
   if (!device->sqtt.ptr)
      return false;

   device->sqtt.buffer_va = radv_buffer_get_va(device->sqtt.bo);

   return true;
}

static void
radv_sqtt_finish_bo(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (unlikely(device->sqtt.bo)) {
      ws->buffer_make_resident(ws, device->sqtt.bo, false);
      radv_bo_destroy(device, NULL, device->sqtt.bo);
   }
}

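/* Record this queue in the RGP queue info chunk so that submissions can be
 * associated with a queue and engine type.
 */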
static VkResult
radv_register_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;
   struct rgp_queue_info_record *record;

   record = malloc(sizeof(struct rgp_queue_info_record));
   if (!record)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   record->queue_id = (uintptr_t)queue;
   record->queue_context = (uintptr_t)queue->hw_ctx;
   if (queue->vk.queue_family_index == RADV_QUEUE_GENERAL) {
      record->hardware_info.queue_type = SQTT_QUEUE_TYPE_UNIVERSAL;
      record->hardware_info.engine_type = SQTT_ENGINE_TYPE_UNIVERSAL;
   } else {
      record->hardware_info.queue_type = SQTT_QUEUE_TYPE_COMPUTE;
      record->hardware_info.engine_type = SQTT_ENGINE_TYPE_COMPUTE;
   }

   simple_mtx_lock(&queue_info->lock);
   list_addtail(&record->list, &queue_info->record);
   queue_info->record_count++;
   simple_mtx_unlock(&queue_info->lock);

   return VK_SUCCESS;
}

static void
radv_unregister_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;

   /* Destroy the queue info record. */
   simple_mtx_lock(&queue_info->lock);
   if (queue_info->record_count > 0) {
      list_for_each_entry_safe (struct rgp_queue_info_record, record, &queue_info->record, list) {
         if (record->queue_id == (uintptr_t)queue) {
            queue_info->record_count--;
            list_del(&record->list);
            free(record);
            break;
         }
      }
   }
   simple_mtx_unlock(&queue_info->lock);
}

static void
radv_register_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_register_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
      radv_register_queue(device, &device->queues[RADV_QUEUE_COMPUTE][i]);
}

static void
radv_unregister_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_COMPUTE][i]);
}

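/* Initialize SQTT: allocate the trace BO, set up queue events and performance
 * counters, and register the queues with the RGP trace state.
 */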
bool
radv_sqtt_init(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;

   /* Default buffer size set to 32MB per SE. */
   device->sqtt.buffer_size = (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);
   device->sqtt.instruction_timing_enabled = radv_is_instruction_timing_enabled();

   if (!radv_sqtt_init_bo(device))
      return false;

   if (!radv_sqtt_init_queue_event(device))
      return false;

   if (!radv_device_acquire_performance_counters(device))
      return false;

   ac_sqtt_init(sqtt);

   radv_register_queues(device, sqtt);

   return true;
}

void
radv_sqtt_finish(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct radeon_winsys *ws = device->ws;

   radv_sqtt_finish_bo(device);
   radv_sqtt_finish_queue_event(device);

   for (unsigned i = 0; i < 2; i++) {
      if (device->sqtt.start_cs[i])
         ws->cs_destroy(device->sqtt.start_cs[i]);
      if (device->sqtt.stop_cs[i])
         ws->cs_destroy(device->sqtt.stop_cs[i]);
   }

   radv_unregister_queues(device, sqtt);

   ac_sqtt_finish(sqtt);
}

static bool
radv_sqtt_resize_bo(struct radv_device *device)
{
   /* Destroy the previous thread trace BO. */
   radv_sqtt_finish_bo(device);

   /* Double the size of the thread trace buffer per SE. */
   device->sqtt.buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %" PRIu64 " KB\n",
           (uint64_t)(device->sqtt.buffer_size / 1024));

   /* Re-create the thread trace BO. */
   return radv_sqtt_init_bo(device);
}

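/* Build and submit the CS that starts SQTT (and SPM when enabled) on the
 * given queue: wait for idle, disable clock gating, enable SQG events, then
 * start the trace.
 */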
static bool
radv_begin_sqtt(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   enum radv_queue_family family = queue->state.qf;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous start CS and create a new one. */
   if (device->sqtt.start_cs[family]) {
      ws->cs_destroy(device->sqtt.start_cs[family]);
      device->sqtt.start_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, radv_queue_ring(queue), false);
   if (!cs)
      return false;

   radeon_check_space(ws, cs, 512);

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   default:
      unreachable("Incorrect queue family");
      break;
   }

   /* Make sure to wait-for-idle before starting SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Disable clock gating before starting SQTT. */
   radv_emit_inhibit_clockgating(device, cs, true);

   /* Enable SQG events that collect thread trace data. */
   radv_emit_spi_config_cntl(device, cs, true);

   radv_perfcounter_emit_spm_reset(cs);

   if (device->spm.bo) {
      /* Enable all shader stages by default. */
      radv_perfcounter_emit_shaders(device, cs, ac_sqtt_get_shader_mask(&pdev->info));

      radv_emit_spm_setup(device, cs, family);
   }

   /* Start SQTT. */
   radv_emit_sqtt_start(device, cs, family);

   if (device->spm.bo) {
      radeon_check_space(ws, cs, 8);
      radv_perfcounter_emit_spm_start(device, cs, family);
   }

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->sqtt.start_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

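/* Build and submit the CS that stops SQTT (and SPM when enabled) and restores
 * the clock gating and SQG state.
 */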
static bool
radv_end_sqtt(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   enum radv_queue_family family = queue->state.qf;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous stop CS and create a new one. */
   if (device->sqtt.stop_cs[family]) {
      ws->cs_destroy(device->sqtt.stop_cs[family]);
      device->sqtt.stop_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, radv_queue_ring(queue), false);
   if (!cs)
      return false;

   radeon_check_space(ws, cs, 512);

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   default:
      unreachable("Incorrect queue family");
      break;
   }

   /* Make sure to wait-for-idle before stopping SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   if (device->spm.bo) {
      radeon_check_space(ws, cs, 8);
      radv_perfcounter_emit_spm_stop(device, cs, family);
   }

   /* Stop SQTT. */
   radv_emit_sqtt_stop(device, cs, family);

   radv_perfcounter_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   radv_emit_spi_config_cntl(device, cs, false);

   /* Restore previous state by re-enabling clock gating. */
   radv_emit_inhibit_clockgating(device, cs, false);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->sqtt.stop_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

void
radv_sqtt_start_capturing(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (ac_check_profile_state(&pdev->info)) {
      fprintf(stderr, "radv: Canceling RGP trace request as a hang condition has been "
                      "detected. Force the GPU into a profiling mode with e.g. "
                      "\"echo profile_peak > "
                      "/sys/class/drm/card0/device/power_dpm_force_performance_level\"\n");
      return;
   }

   /* Sample CPU/GPU clocks before starting the trace. */
   if (!radv_sqtt_sample_clocks(device)) {
      fprintf(stderr, "radv: Failed to sample clocks\n");
   }

   radv_begin_sqtt(queue);
   assert(!device->sqtt_enabled);
   device->sqtt_enabled = true;
}

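/* Stop capturing, wait for the GPU to be idle and dump the RGP capture.
 * Returns false if the capture failed (e.g. the SQTT buffer was too small).
 */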
bool
radv_sqtt_stop_capturing(struct radv_queue *queue)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct ac_sqtt_trace sqtt_trace = {0};
   struct ac_spm_trace spm_trace;
   bool captured = true;

   radv_end_sqtt(queue);
   device->sqtt_enabled = false;

   /* TODO: Do something better than this whole sync. */
   device->vk.dispatch_table.QueueWaitIdle(radv_queue_to_handle(queue));

   if (radv_get_sqtt_trace(queue, &sqtt_trace) && (!device->spm.bo || radv_get_spm_trace(queue, &spm_trace))) {
      ac_dump_rgp_capture(&pdev->info, &sqtt_trace, device->spm.bo ? &spm_trace : NULL);
   } else {
      /* Failed to capture because the buffer was too small. */
      captured = false;
   }

   /* Clear resources used for this capture. */
   radv_reset_sqtt_trace(device);

   return captured;
}

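/* Retrieve the SQTT trace; on failure (e.g. the buffer was too small), double
 * the buffer size for the next attempt.
 */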
bool
radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace)
{
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radeon_info *gpu_info = &pdev->info;

   if (!ac_sqtt_get_trace(&device->sqtt, gpu_info, sqtt_trace)) {
      if (!radv_sqtt_resize_bo(device))
         fprintf(stderr, "radv: Failed to resize the SQTT buffer.\n");
      return false;
   }

   return true;
}

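/* Reset all per-capture state: clock calibration records, queue event
 * records, timestamps and the timed command buffers.
 */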
void
radv_reset_sqtt_trace(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
   struct rgp_queue_event *queue_event = &sqtt->rgp_queue_event;

   /* Clear clock calibration records. */
   simple_mtx_lock(&clock_calibration->lock);
   list_for_each_entry_safe (struct rgp_clock_calibration_record, record, &clock_calibration->record, list) {
      clock_calibration->record_count--;
      list_del(&record->list);
      free(record);
   }
   simple_mtx_unlock(&clock_calibration->lock);

   /* Clear queue event records. */
   simple_mtx_lock(&queue_event->lock);
   list_for_each_entry_safe (struct rgp_queue_event_record, record, &queue_event->record, list) {
      list_del(&record->list);
      free(record);
   }
   queue_event->record_count = 0;
   simple_mtx_unlock(&queue_event->lock);

   /* Clear timestamps. */
   radv_sqtt_reset_timestamp(device);

   /* Clear timed cmdbufs. */
   simple_mtx_lock(&device->sqtt_command_pool_mtx);
   for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++) {
      /* If RADV_DEBUG_NO_COMPUTE_QUEUE is used, there's no compute SQTT command pool. */
      if (device->sqtt_command_pool[i])
         vk_common_TrimCommandPool(radv_device_to_handle(device),
                                   vk_command_pool_to_handle(device->sqtt_command_pool[i]), 0);
   }
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
}

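/* Sample the CPU (CLOCK_MONOTONIC) and GPU clock domains at the same point in
 * time using VK_KHR_calibrated_timestamps.
 */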
static VkResult
radv_get_calibrated_timestamps(struct radv_device *device, uint64_t *cpu_timestamp, uint64_t *gpu_timestamp)
{
   uint64_t timestamps[2];
   uint64_t max_deviation;
   VkResult result;

   const VkCalibratedTimestampInfoKHR timestamp_infos[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
         .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
      },
      {
         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
         .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
      },
   };

   result = device->vk.dispatch_table.GetCalibratedTimestampsKHR(radv_device_to_handle(device), 2, timestamp_infos,
                                                                 timestamps, &max_deviation);
   if (result != VK_SUCCESS)
      return result;

   *cpu_timestamp = timestamps[0];
   *gpu_timestamp = timestamps[1];

   return result;
}

bool
radv_sqtt_sample_clocks(struct radv_device *device)
{
   uint64_t cpu_timestamp = 0, gpu_timestamp = 0;
   VkResult result;

   result = radv_get_calibrated_timestamps(device, &cpu_timestamp, &gpu_timestamp);
   if (result != VK_SUCCESS)
      return false;

   return ac_sqtt_add_clock_calibration(&device->sqtt, cpu_timestamp, gpu_timestamp);
}

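/* Allocate and record a small internal command buffer that writes a GPU
 * timestamp for a queue event at the given pipeline stage.
 */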
VkResult
radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_winsys_bo *timestamp_bo, uint32_t timestamp_offset,
                           VkPipelineStageFlags2 timestamp_stage, VkCommandBuffer *pcmdbuf)
{
   struct radv_device *device = radv_queue_device(queue);
   enum radv_queue_family queue_family = queue->state.qf;
   VkCommandBuffer cmdbuf;
   uint64_t timestamp_va;
   VkResult result;

   assert(queue_family == RADV_QUEUE_GENERAL || queue_family == RADV_QUEUE_COMPUTE);

   simple_mtx_lock(&device->sqtt_command_pool_mtx);

   const VkCommandBufferAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
      .commandPool = vk_command_pool_to_handle(device->sqtt_command_pool[queue_family]),
      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
      .commandBufferCount = 1,
   };

   result = vk_common_AllocateCommandBuffers(radv_device_to_handle(device), &alloc_info, &cmdbuf);
   if (result != VK_SUCCESS)
      goto fail;

   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
   };

   result = radv_BeginCommandBuffer(cmdbuf, &begin_info);
   if (result != VK_SUCCESS)
      goto fail;

   radeon_check_space(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, 28);

   timestamp_va = radv_buffer_get_va(timestamp_bo) + timestamp_offset;

   radv_cs_add_buffer(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, timestamp_bo);

   radv_write_timestamp(radv_cmd_buffer_from_handle(cmdbuf), timestamp_va, timestamp_stage);

   result = radv_EndCommandBuffer(cmdbuf);
   if (result != VK_SUCCESS)
      goto fail;

   *pcmdbuf = cmdbuf;

fail:
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
   return result;
}