/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "sid.h"

#include "vk_common_entrypoints.h"

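/* Thread trace buffer addresses and sizes are programmed in 4 KiB (1 << 12) units. */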
#define SQTT_BUFFER_ALIGN_SHIFT 12

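/* Runtime debug toggles read from the environment; both default to enabled. */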
bool
radv_is_instruction_timing_enabled(void)
{
   return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true);
}

bool
radv_sqtt_queue_events_enabled(void)
{
   return debug_get_bool_option("RADV_THREAD_TRACE_QUEUE_EVENTS", true);
}

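/* Build the SQ_THREAD_TRACE_CTRL value that enables (or disables) tracing. */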
static uint32_t
gfx11_get_sqtt_ctrl(const struct radv_device *device, bool enable)
{
   return S_0367B0_MODE(enable) | S_0367B0_HIWATER(5) | S_0367B0_UTIL_TIMER(1) | S_0367B0_RT_FREQ(2) | /* 4096 clk */
          S_0367B0_DRAW_EVENT_EN(1) | S_0367B0_SPI_STALL_EN(1) | S_0367B0_SQ_STALL_EN(1) | S_0367B0_REG_AT_HWM(2);
}

static uint32_t
gfx10_get_sqtt_ctrl(const struct radv_device *device, bool enable)
{
   uint32_t sqtt_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) |
                        S_008D1C_RT_FREQ(2) | /* 4096 clk */
                        S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) |
                        S_008D1C_SQ_STALL_EN(1) | S_008D1C_REG_DROP_ON_STALL(0);

   if (device->physical_device->rad_info.gfx_level == GFX10_3)
      sqtt_ctrl |= S_008D1C_LOWATER_OFFSET(4);

   if (device->physical_device->rad_info.has_sqtt_auto_flush_mode_bug)
      sqtt_ctrl |= S_008D1C_AUTO_FLUSH_MODE(1);

   return sqtt_ctrl;
}

static enum radv_queue_family
radv_ip_to_queue_family(enum amd_ip_type t)
{
   switch (t) {
   case AMD_IP_GFX:
      return RADV_QUEUE_GENERAL;
   case AMD_IP_COMPUTE:
      return RADV_QUEUE_COMPUTE;
   case AMD_IP_SDMA:
      return RADV_QUEUE_TRANSFER;
   default:
      unreachable("Unknown IP type");
   }
}

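/* Flush and invalidate all GPU caches and drain the pipeline so no in-flight work
 * overlaps the start or stop of a trace. */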
static void
radv_emit_wait_for_idle(const struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   const enum radv_queue_family qf = radv_ip_to_queue_family(family);
   enum rgp_flush_bits sqtt_flush_bits = 0;
   radv_cs_emit_cache_flush(
      device->ws, cs, device->physical_device->rad_info.gfx_level, NULL, 0, qf,
      (family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH
                                    : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
         RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2,
      &sqtt_flush_bits, 0);
}

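/* Program and start the thread trace on every shader engine. Each SE gets its own
 * slice of the trace BO; the register sequence differs per GFX generation. */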
static void
radv_emit_sqtt_start(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
   uint32_t shifted_size = device->sqtt.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   const struct radeon_info *rad_info = &device->physical_device->rad_info;
   const unsigned shader_mask = ac_sqtt_get_shader_mask(rad_info);
   unsigned max_se = rad_info->max_se;

   radeon_check_space(device->ws, cs, 6 + max_se * 33);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = radv_buffer_get_va(device->sqtt.bo);
      uint64_t data_va = ac_sqtt_get_data_va(rad_info, &device->sqtt, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      int active_cu = ac_sqtt_get_active_cu(&device->physical_device->rad_info, se);

      if (ac_sqtt_se_is_disabled(rad_info, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.gfx_level >= GFX11) {
         /* Order seems important for the following 2 registers. */
         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
                                S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32));

         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B4_SQ_THREAD_TRACE_MASK,
                                S_0367B4_WTYPE_INCLUDE(shader_mask) | S_0367B4_SA_SEL(0) |
                                   S_0367B4_WGP_SEL(active_cu / 2) | S_0367B4_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_0367B8_REG_INCLUDE(V_0367B8_REG_INCLUDE_SQDEC | V_0367B8_REG_INCLUDE_SHDEC |
                                                         V_0367B8_REG_INCLUDE_GFXUDEC | V_0367B8_REG_INCLUDE_COMP |
                                                         V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_0367B8_TOKEN_EXCLUDE_PERF;

         if (!radv_is_instruction_timing_enabled()) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC |
                             V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
                             V_0367B8_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE(1);

         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B0_SQ_THREAD_TRACE_CTRL, gfx11_get_sqtt_ctrl(device, true));

      } else if (device->physical_device->rad_info.gfx_level >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                                          S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         radeon_set_privileged_config_reg(cs, R_008D14_SQ_THREAD_TRACE_MASK,
                                          S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
                                             S_008D14_WGP_SEL(active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
                                                         V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_COMP |
                                                         V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!radv_is_instruction_timing_enabled()) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |=
            S_008D18_TOKEN_EXCLUDE(token_exclude) | S_008D18_BOP_EVENTS_TOKEN_INCLUDE(gfx_level == GFX10_3);

         radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, gfx10_get_sqtt_ctrl(device, true));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2, S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t sqtt_mask = S_030CC8_CU_SEL(active_cu) | S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
                              S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                              S_030CC8_SQ_STALL_EN(1);

         if (device->physical_device->rad_info.gfx_level < GFX9) {
            sqtt_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (device->physical_device->rad_info.gfx_level == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
                              S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
                              S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
                              S_030CD8_MODE(1);

         if (device->physical_device->rad_info.gfx_level == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            sqtt_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(
      cs, R_030800_GRBM_GFX_INDEX,
      S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (qf == RADV_QUEUE_COMPUTE) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
}

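/* Status registers copied back into the per-SE ac_sqtt_data_info struct after a
 * trace: write pointer, status and (dropped) counter. */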
static const uint32_t gfx8_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_sqtt_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static const uint32_t gfx11_sqtt_info_regs[] = {
   R_0367BC_SQ_THREAD_TRACE_WPTR,
   R_0367D0_SQ_THREAD_TRACE_STATUS,
   R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static void
radv_copy_sqtt_info_regs(const struct radv_device *device, struct radeon_cmdbuf *cs, unsigned se_index)
{
   const struct radv_physical_device *pdevice = device->physical_device;
   const uint32_t *sqtt_info_regs = NULL;

   if (device->physical_device->rad_info.gfx_level >= GFX11) {
      sqtt_info_regs = gfx11_sqtt_info_regs;
   } else if (device->physical_device->rad_info.gfx_level >= GFX10) {
      sqtt_info_regs = gfx10_sqtt_info_regs;
   } else if (device->physical_device->rad_info.gfx_level == GFX9) {
      sqtt_info_regs = gfx9_sqtt_info_regs;
   } else {
      assert(device->physical_device->rad_info.gfx_level == GFX8);
      sqtt_info_regs = gfx8_sqtt_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = radv_buffer_get_va(device->sqtt.bo);
   uint64_t info_va = ac_sqtt_get_info_va(va, se_index);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, sqtt_info_regs[i] >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, (info_va + i * 4));
      radeon_emit(cs, (info_va + i * 4) >> 32);
   }

   if (pdevice->rad_info.gfx_level >= GFX11) {
      /* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
       * To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
       * subtract the "initial WPTR address" from SQ_THREAD_TRACE_WPTR, as follows:
       *
       * 1) get the current buffer base address for this SE
       * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
       * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
       */
      uint64_t data_va = ac_sqtt_get_data_va(&pdevice->rad_info, &device->sqtt, va, se_index);
      uint64_t shifted_data_va = (data_va >> 5);
      uint32_t init_wptr_value = shifted_data_va & 0x1fffffff;

      radeon_emit(cs, PKT3(PKT3_ATOMIC_MEM, 7, 0));
      radeon_emit(cs, ATOMIC_OP(TC_OP_ATOMIC_SUB_32));
      radeon_emit(cs, info_va);         /* addr lo */
      radeon_emit(cs, info_va >> 32);   /* addr hi */
      radeon_emit(cs, init_wptr_value); /* data lo */
      radeon_emit(cs, 0);               /* data hi */
      radeon_emit(cs, 0);               /* compare data lo */
      radeon_emit(cs, 0);               /* compare data hi */
      radeon_emit(cs, 0);               /* loop interval */
   }
}

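/* Stop the thread trace on every SE: emit the stop/finish events, wait for the
 * trace to land in memory, disable the trace mode, then copy back the info
 * registers. */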
static void
radv_emit_sqtt_stop(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
   unsigned max_se = device->physical_device->rad_info.max_se;

   radeon_check_space(device->ws, cs, 8 + max_se * 64);

   /* Stop the thread trace with a different event based on the queue. */
   if (qf == RADV_QUEUE_COMPUTE) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));

   if (device->physical_device->rad_info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE doesn't work. */
      radv_emit_wait_for_idle(device, cs, qf);
   }

   for (unsigned se = 0; se < max_se; se++) {
      if (ac_sqtt_se_is_disabled(&device->physical_device->rad_info, se))
         continue;

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.gfx_level >= GFX11) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
         radeon_emit(cs, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* reference value */
         radeon_emit(cs, ~C_0367D0_FINISH_DONE);
         radeon_emit(cs, 4); /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B0_SQ_THREAD_TRACE_CTRL, gfx11_get_sqtt_ctrl(device, false));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value */
         radeon_emit(cs, ~C_0367D0_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      } else if (device->physical_device->rad_info.gfx_level >= GFX10) {
         if (!device->physical_device->rad_info.has_sqtt_rb_harvest_bug) {
            /* Make sure to wait for the trace buffer. */
            radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
            radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
            radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
            radeon_emit(cs, 0);
            radeon_emit(cs, 0); /* reference value */
            radeon_emit(cs, ~C_008D20_FINISH_DONE);
            radeon_emit(cs, 4); /* poll interval */
         }

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, gfx10_get_sqtt_ctrl(device, false));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value */
         radeon_emit(cs, ~C_008D20_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value */
         radeon_emit(cs, ~C_030CE8_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      }

      radv_copy_sqtt_info_regs(device, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(
      cs, R_030800_GRBM_GFX_INDEX,
      S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));
}

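/* Write user data (e.g. API markers) into the SQTT stream through the USERDATA
 * registers, two dwords at a time. */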
void
radv_emit_sqtt_userdata(const struct radv_cmd_buffer *cmd_buffer, const void *data, uint32_t num_dwords)
{
   const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
   const enum radv_queue_family qf = cmd_buffer->qf;
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const uint32_t *dwords = (uint32_t *)data;

   /* SQTT user data packets aren't supported on SDMA queues. */
   if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
      return;

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_check_space(device->ws, cs, 2 + count);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      if (device->physical_device->rad_info.gfx_level >= GFX10)
         radeon_set_uconfig_reg_seq_perfctr(gfx_level, qf, cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      else
         radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(cs, dwords, count);

      dwords += count;
      num_dwords -= count;
   }
}

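/* Enable (or disable) the SQG events that SQTT needs on the SPI side. */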
void
radv_emit_spi_config_cntl(const struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
{
   if (device->physical_device->rad_info.gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (device->physical_device->rad_info.gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(cs, R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
}

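/* Inhibit (or restore) clock gating through RLC_PERFMON_CLK_CNTL; not needed on
 * GFX11+. */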
void
radv_emit_inhibit_clockgating(const struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (device->physical_device->rad_info.gfx_level >= GFX11)
      return; /* not needed */

   if (device->physical_device->rad_info.gfx_level >= GFX10) {
      radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (device->physical_device->rad_info.gfx_level >= GFX8) {
      radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
}

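/* Sub-allocate an 8-byte GPU timestamp slot from a growable, CPU-visible GTT
 * buffer. When the current buffer is full, a new one (at least 4 KiB, otherwise
 * double the previous size) is created and the old one is kept on a list until
 * radv_sqtt_reset_timestamp() so previously returned pointers stay valid.
 *
 * Typical usage (sketch):
 *
 *    struct radeon_winsys_bo *bo;
 *    uint32_t offset;
 *    void *ptr;
 *    if (radv_sqtt_acquire_gpu_timestamp(device, &bo, &offset, &ptr) == VK_SUCCESS) {
 *       // emit a GPU timestamp write into bo at offset, read it back through ptr
 *    }
 */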
VkResult
radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo,
                                uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr)
{
   struct radeon_winsys *ws = device->ws;

   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   if (device->sqtt_timestamp.offset + 8 > device->sqtt_timestamp.size) {
      struct radeon_winsys_bo *bo;
      uint64_t new_size;
      VkResult result;
      uint8_t *map;

      new_size = MAX2(4096, 2 * device->sqtt_timestamp.size);

      result = ws->buffer_create(ws, new_size, 8, RADEON_DOMAIN_GTT,
                                 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_SCRATCH,
                                 0, &bo);
      if (result != VK_SUCCESS) {
         simple_mtx_unlock(&device->sqtt_timestamp_mtx);
         return result;
      }

      map = device->ws->buffer_map(bo);
      if (!map) {
         ws->buffer_destroy(ws, bo);
         simple_mtx_unlock(&device->sqtt_timestamp_mtx);
         return VK_ERROR_OUT_OF_DEVICE_MEMORY;
      }

      if (device->sqtt_timestamp.bo) {
         struct radv_sqtt_timestamp *new_timestamp;

         new_timestamp = malloc(sizeof(*new_timestamp));
         if (!new_timestamp) {
            ws->buffer_destroy(ws, bo);
            simple_mtx_unlock(&device->sqtt_timestamp_mtx);
            return VK_ERROR_OUT_OF_HOST_MEMORY;
         }

         memcpy(new_timestamp, &device->sqtt_timestamp, sizeof(*new_timestamp));
         list_add(&new_timestamp->list, &device->sqtt_timestamp.list);
      }

      device->sqtt_timestamp.bo = bo;
      device->sqtt_timestamp.size = new_size;
      device->sqtt_timestamp.offset = 0;
      device->sqtt_timestamp.map = map;
   }

   *gpu_timestamp_bo = device->sqtt_timestamp.bo;
   *gpu_timestamp_offset = device->sqtt_timestamp.offset;
   *gpu_timestamp_ptr = device->sqtt_timestamp.map + device->sqtt_timestamp.offset;

   device->sqtt_timestamp.offset += 8;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);

   return VK_SUCCESS;
}

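/* Destroy all retired timestamp BOs and rewind the current one. */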
static void
radv_sqtt_reset_timestamp(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   list_for_each_entry_safe (struct radv_sqtt_timestamp, ts, &device->sqtt_timestamp.list, list) {
      ws->buffer_destroy(ws, ts->bo);
      list_del(&ts->list);
      free(ts);
   }

   device->sqtt_timestamp.offset = 0;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);
}

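/* Create the internal command pools (GFX, plus compute when available) and the
 * timestamp state used to time queue events. */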
static bool
radv_sqtt_init_queue_event(struct radv_device *device)
{
   VkCommandPool cmd_pool;
   VkResult result;

   const VkCommandPoolCreateInfo create_gfx_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
      .queueFamilyIndex = RADV_QUEUE_GENERAL, /* Graphics queue is always the first queue. */
   };

   result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_gfx_info, NULL, &cmd_pool);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt_command_pool[0] = vk_command_pool_from_handle(cmd_pool);

   if (!(device->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
      const VkCommandPoolCreateInfo create_comp_info = {
         .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
         .queueFamilyIndex = RADV_QUEUE_COMPUTE,
      };

      result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_comp_info, NULL, &cmd_pool);
      if (result != VK_SUCCESS)
         return false;

      device->sqtt_command_pool[1] = vk_command_pool_from_handle(cmd_pool);
   }

   simple_mtx_init(&device->sqtt_command_pool_mtx, mtx_plain);

   simple_mtx_init(&device->sqtt_timestamp_mtx, mtx_plain);
   list_inithead(&device->sqtt_timestamp.list);

   return true;
}

static void
radv_sqtt_finish_queue_event(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (device->sqtt_timestamp.bo)
      ws->buffer_destroy(ws, device->sqtt_timestamp.bo);

   simple_mtx_destroy(&device->sqtt_timestamp_mtx);

   for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++)
      vk_common_DestroyCommandPool(radv_device_to_handle(device),
                                   vk_command_pool_to_handle(device->sqtt_command_pool[i]), NULL);

   simple_mtx_destroy(&device->sqtt_command_pool_mtx);
}

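/* Create the thread trace BO. Layout: one ac_sqtt_data_info struct per SE
 * (aligned to 4 KiB), followed by a buffer_size-sized data slice per SE. */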
static bool
radv_sqtt_init_bo(struct radv_device *device)
{
   unsigned max_se = device->physical_device->rad_info.max_se;
   struct radeon_winsys *ws = device->ws;
   VkResult result;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += device->sqtt.buffer_size * (uint64_t)max_se;

   struct radeon_winsys_bo *bo = NULL;
   result = ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_VRAM,
                              RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
                              RADV_BO_PRIORITY_SCRATCH, 0, &bo);
   device->sqtt.bo = bo;
   if (result != VK_SUCCESS)
      return false;

   result = ws->buffer_make_resident(ws, device->sqtt.bo, true);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt.ptr = ws->buffer_map(device->sqtt.bo);
   if (!device->sqtt.ptr)
      return false;

   return true;
}

static void
radv_sqtt_finish_bo(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (unlikely(device->sqtt.bo)) {
      ws->buffer_make_resident(ws, device->sqtt.bo, false);
      ws->buffer_destroy(ws, device->sqtt.bo);
   }
}

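/* Record a queue in the RGP queue-info chunk so trace events can be attributed
 * to it. */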
static VkResult
radv_register_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;
   struct rgp_queue_info_record *record;

   record = malloc(sizeof(struct rgp_queue_info_record));
   if (!record)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   record->queue_id = (uintptr_t)queue;
   record->queue_context = (uintptr_t)queue->hw_ctx;
   if (queue->vk.queue_family_index == RADV_QUEUE_GENERAL) {
      record->hardware_info.queue_type = SQTT_QUEUE_TYPE_UNIVERSAL;
      record->hardware_info.engine_type = SQTT_ENGINE_TYPE_UNIVERSAL;
   } else {
      record->hardware_info.queue_type = SQTT_QUEUE_TYPE_COMPUTE;
      record->hardware_info.engine_type = SQTT_ENGINE_TYPE_COMPUTE;
   }

   simple_mtx_lock(&queue_info->lock);
   list_addtail(&record->list, &queue_info->record);
   queue_info->record_count++;
   simple_mtx_unlock(&queue_info->lock);

   return VK_SUCCESS;
}

static void
radv_unregister_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;

   /* Destroy queue info record. */
   simple_mtx_lock(&queue_info->lock);
   if (queue_info->record_count > 0) {
      list_for_each_entry_safe (struct rgp_queue_info_record, record, &queue_info->record, list) {
         if (record->queue_id == (uintptr_t)queue) {
            queue_info->record_count--;
            list_del(&record->list);
            free(record);
            break;
         }
      }
   }
   simple_mtx_unlock(&queue_info->lock);
}

static void
radv_register_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_register_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
      radv_register_queue(device, &device->queues[RADV_QUEUE_COMPUTE][i]);
}

static void
radv_unregister_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_COMPUTE][i]);
}

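/* One-time SQTT initialization at device creation: allocate the trace BO and the
 * queue-event state, then register the existing queues with RGP. */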
bool
radv_sqtt_init(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;

   /* Default buffer size set to 32MB per SE. */
   device->sqtt.buffer_size = (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);

   if (!radv_sqtt_init_bo(device))
      return false;

   if (!radv_sqtt_init_queue_event(device))
      return false;

   if (!radv_device_acquire_performance_counters(device))
      return false;

   ac_sqtt_init(sqtt);

   radv_register_queues(device, sqtt);

   return true;
}

void
radv_sqtt_finish(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct radeon_winsys *ws = device->ws;

   radv_sqtt_finish_bo(device);
   radv_sqtt_finish_queue_event(device);

   for (unsigned i = 0; i < 2; i++) {
      if (device->sqtt.start_cs[i])
         ws->cs_destroy(device->sqtt.start_cs[i]);
      if (device->sqtt.stop_cs[i])
         ws->cs_destroy(device->sqtt.stop_cs[i]);
   }

   radv_unregister_queues(device, sqtt);

   ac_sqtt_finish(sqtt);
}

static bool
radv_sqtt_resize_bo(struct radv_device *device)
{
   /* Destroy the previous thread trace BO. */
   radv_sqtt_finish_bo(device);

   /* Double the size of the thread trace buffer per SE. */
   device->sqtt.buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           device->sqtt.buffer_size / 1024);

   /* Re-create the thread trace BO. */
   return radv_sqtt_init_bo(device);
}

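/* Build and submit the CS that starts a capture on the given queue: wait for
 * idle, inhibit clock gating, enable SQG events, optionally set up SPM, then
 * start SQTT. */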
bool
radv_begin_sqtt(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   enum radv_queue_family family = queue->state.qf;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous start CS and create a new one. */
   if (device->sqtt.start_cs[family]) {
      ws->cs_destroy(device->sqtt.start_cs[family]);
      device->sqtt.start_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, radv_queue_ring(queue), false);
   if (!cs)
      return false;

   radeon_check_space(ws, cs, 512);

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   default:
      unreachable("Incorrect queue family");
      break;
   }

   /* Make sure to wait-for-idle before starting SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Disable clock gating before starting SQTT. */
   radv_emit_inhibit_clockgating(device, cs, true);

   /* Enable SQG events that collect thread trace data. */
   radv_emit_spi_config_cntl(device, cs, true);

   radv_perfcounter_emit_spm_reset(cs);

   if (device->spm.bo) {
      /* Enable all shader stages by default. */
      radv_perfcounter_emit_shaders(device, cs, ac_sqtt_get_shader_mask(&device->physical_device->rad_info));

      radv_emit_spm_setup(device, cs, family);
   }

   /* Start SQTT. */
   radv_emit_sqtt_start(device, cs, family);

   if (device->spm.bo)
      radv_perfcounter_emit_spm_start(device, cs, family);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->sqtt.start_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

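/* Build and submit the CS that stops a capture: wait for idle, stop SPM and
 * SQTT, then restore the SPI and clock gating state. */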
bool
radv_end_sqtt(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   enum radv_queue_family family = queue->state.qf;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous stop CS and create a new one. */
   if (queue->device->sqtt.stop_cs[family]) {
      ws->cs_destroy(device->sqtt.stop_cs[family]);
      device->sqtt.stop_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, radv_queue_ring(queue), false);
   if (!cs)
      return false;

   radeon_check_space(ws, cs, 512);

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   default:
      unreachable("Incorrect queue family");
      break;
   }

   /* Make sure to wait-for-idle before stopping SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   if (device->spm.bo)
      radv_perfcounter_emit_spm_stop(device, cs, family);

   /* Stop SQTT. */
   radv_emit_sqtt_stop(device, cs, family);

   radv_perfcounter_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   radv_emit_spi_config_cntl(device, cs, false);

   /* Restore previous state by re-enabling clock gating. */
   radv_emit_inhibit_clockgating(device, cs, false);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->sqtt.stop_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

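/* Retrieve the captured trace; on overflow, grow the buffer so the caller can
 * retry the capture. */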
bool
radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace)
{
   struct radv_device *device = queue->device;
   const struct radeon_info *rad_info = &device->physical_device->rad_info;

   if (!ac_sqtt_get_trace(&device->sqtt, rad_info, sqtt_trace)) {
      if (!radv_sqtt_resize_bo(device))
         fprintf(stderr, "radv: Failed to resize the SQTT buffer.\n");
      return false;
   }

   return true;
}

void
radv_reset_sqtt_trace(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
   struct rgp_queue_event *queue_event = &sqtt->rgp_queue_event;

   /* Clear clock calibration records. */
   simple_mtx_lock(&clock_calibration->lock);
   list_for_each_entry_safe (struct rgp_clock_calibration_record, record, &clock_calibration->record, list) {
      clock_calibration->record_count--;
      list_del(&record->list);
      free(record);
   }
   simple_mtx_unlock(&clock_calibration->lock);

   /* Clear queue event records. */
   simple_mtx_lock(&queue_event->lock);
   list_for_each_entry_safe (struct rgp_queue_event_record, record, &queue_event->record, list) {
      list_del(&record->list);
      free(record);
   }
   queue_event->record_count = 0;
   simple_mtx_unlock(&queue_event->lock);

   /* Clear timestamps. */
   radv_sqtt_reset_timestamp(device);

   /* Clear timed cmdbufs. */
   simple_mtx_lock(&device->sqtt_command_pool_mtx);
   for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++) {
      vk_common_TrimCommandPool(radv_device_to_handle(device), vk_command_pool_to_handle(device->sqtt_command_pool[i]),
                                0);
   }
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
}

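/* Sample the CPU (CLOCK_MONOTONIC) and GPU clocks as close together as possible
 * so RGP can correlate the two time domains. */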
static VkResult
radv_get_calibrated_timestamps(struct radv_device *device, uint64_t *cpu_timestamp, uint64_t *gpu_timestamp)
{
   uint64_t timestamps[2];
   uint64_t max_deviation;
   VkResult result;

   const VkCalibratedTimestampInfoKHR timestamp_infos[2] = {{
                                                               .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
                                                               .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
                                                            },
                                                            {
                                                               .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
                                                               .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
                                                            }};

   result =
      radv_GetCalibratedTimestampsKHR(radv_device_to_handle(device), 2, timestamp_infos, timestamps, &max_deviation);
   if (result != VK_SUCCESS)
      return result;

   *cpu_timestamp = timestamps[0];
   *gpu_timestamp = timestamps[1];

   return result;
}

bool
radv_sqtt_sample_clocks(struct radv_device *device)
{
   uint64_t cpu_timestamp = 0, gpu_timestamp = 0;
   VkResult result;

   result = radv_get_calibrated_timestamps(device, &cpu_timestamp, &gpu_timestamp);
   if (result != VK_SUCCESS)
      return false;

   return ac_sqtt_add_clock_calibration(&device->sqtt, cpu_timestamp, gpu_timestamp);
}

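/* Allocate and record a one-time command buffer that writes a GPU timestamp at
 * the given pipeline stage; used to time queue events for RGP. */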
VkResult
radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_winsys_bo *timestamp_bo, uint32_t timestamp_offset,
                           VkPipelineStageFlags2 timestamp_stage, VkCommandBuffer *pcmdbuf)
{
   struct radv_device *device = queue->device;
   enum radv_queue_family queue_family = queue->state.qf;
   VkCommandBuffer cmdbuf;
   uint64_t timestamp_va;
   VkResult result;

   assert(queue_family == RADV_QUEUE_GENERAL || queue_family == RADV_QUEUE_COMPUTE);

   simple_mtx_lock(&device->sqtt_command_pool_mtx);

   const VkCommandBufferAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
      .commandPool = vk_command_pool_to_handle(device->sqtt_command_pool[queue_family]),
      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
      .commandBufferCount = 1,
   };

   result = vk_common_AllocateCommandBuffers(radv_device_to_handle(device), &alloc_info, &cmdbuf);
   if (result != VK_SUCCESS)
      goto fail;

   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
   };

   result = radv_BeginCommandBuffer(cmdbuf, &begin_info);
   if (result != VK_SUCCESS)
      goto fail;

   radeon_check_space(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, 28);

   timestamp_va = radv_buffer_get_va(timestamp_bo) + timestamp_offset;

   radv_cs_add_buffer(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, timestamp_bo);

   radv_write_timestamp(radv_cmd_buffer_from_handle(cmdbuf), timestamp_va, timestamp_stage);

   result = radv_EndCommandBuffer(cmdbuf);
   if (result != VK_SUCCESS)
      goto fail;

   *pcmdbuf = cmdbuf;

fail:
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
   return result;
}