/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "sid.h"

#include "vk_common_entrypoints.h"

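/* The SQTT buffer base address and size are programmed into hardware
 * registers in units of 4 KiB (1 << SQTT_BUFFER_ALIGN_SHIFT), hence the
 * shifts and the alignment below.
 */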
#define SQTT_BUFFER_ALIGN_SHIFT 12

bool
radv_is_instruction_timing_enabled(void)
{
   return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true);
}

bool
radv_sqtt_queue_events_enabled(void)
{
   return debug_get_bool_option("RADV_THREAD_TRACE_QUEUE_EVENTS", true);
}
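
/* Both toggles above default to true and are read from the environment, so a
 * capture can disable them without code changes, e.g.:
 *
 *    RADV_THREAD_TRACE_INSTRUCTION_TIMING=false \
 *    RADV_THREAD_TRACE_QUEUE_EVENTS=false <application>
 */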

static uint32_t
gfx11_get_sqtt_ctrl(const struct radv_device *device, bool enable)
{
   return S_0367B0_MODE(enable) | S_0367B0_HIWATER(5) | S_0367B0_UTIL_TIMER(1) | S_0367B0_RT_FREQ(2) | /* 4096 clk */
          S_0367B0_DRAW_EVENT_EN(1) | S_0367B0_SPI_STALL_EN(1) | S_0367B0_SQ_STALL_EN(1) | S_0367B0_REG_AT_HWM(2);
}

static uint32_t
gfx10_get_sqtt_ctrl(const struct radv_device *device, bool enable)
{
   uint32_t sqtt_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) |
                        S_008D1C_RT_FREQ(2) | /* 4096 clk */
                        S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) |
                        S_008D1C_SQ_STALL_EN(1) | S_008D1C_REG_DROP_ON_STALL(0);

   if (device->physical_device->rad_info.gfx_level == GFX10_3)
      sqtt_ctrl |= S_008D1C_LOWATER_OFFSET(4);

   if (device->physical_device->rad_info.has_sqtt_auto_flush_mode_bug)
      sqtt_ctrl |= S_008D1C_AUTO_FLUSH_MODE(1);

   return sqtt_ctrl;
}

static enum radv_queue_family
radv_ip_to_queue_family(enum amd_ip_type t)
{
   switch (t) {
   case AMD_IP_GFX:
      return RADV_QUEUE_GENERAL;
   case AMD_IP_COMPUTE:
      return RADV_QUEUE_COMPUTE;
   case AMD_IP_SDMA:
      return RADV_QUEUE_TRANSFER;
   default:
      unreachable("Unknown IP type");
   }
}

static void
radv_emit_wait_for_idle(const struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   const enum radv_queue_family qf = radv_ip_to_queue_family(family);
   enum rgp_flush_bits sqtt_flush_bits = 0;
   radv_cs_emit_cache_flush(
      device->ws, cs, device->physical_device->rad_info.gfx_level, NULL, 0, qf,
      (family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH
                                    : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
         RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2,
      &sqtt_flush_bits, 0);
}

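/* Program the per-SE SQTT registers and emit the start event. The register
 * offsets differ per generation (GFX11, GFX10/GFX10.3, and GFX9 and older),
 * but the sequence is the same: select the SE via GRBM_GFX_INDEX, program
 * the buffer base/size, the wave/token masks, then enable the trace last.
 */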
static void
radv_emit_sqtt_start(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
   uint32_t shifted_size = device->sqtt.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   const struct radeon_info *rad_info = &device->physical_device->rad_info;
   const unsigned shader_mask = ac_sqtt_get_shader_mask(rad_info);
   unsigned max_se = rad_info->max_se;

   radeon_check_space(device->ws, cs, 6 + max_se * 33);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = radv_buffer_get_va(device->sqtt.bo);
      uint64_t data_va = ac_sqtt_get_data_va(rad_info, &device->sqtt, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      int active_cu = ac_sqtt_get_active_cu(&device->physical_device->rad_info, se);

      if (ac_sqtt_se_is_disabled(rad_info, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.gfx_level >= GFX11) {
         /* Order seems important for the following 2 registers. */
         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
                                S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32));

         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B4_SQ_THREAD_TRACE_MASK,
                                S_0367B4_WTYPE_INCLUDE(shader_mask) | S_0367B4_SA_SEL(0) |
                                   S_0367B4_WGP_SEL(active_cu / 2) | S_0367B4_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_0367B8_REG_INCLUDE(V_0367B8_REG_INCLUDE_SQDEC | V_0367B8_REG_INCLUDE_SHDEC |
                                                         V_0367B8_REG_INCLUDE_GFXUDEC | V_0367B8_REG_INCLUDE_COMP |
                                                         V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_0367B8_TOKEN_EXCLUDE_PERF;

         if (!radv_is_instruction_timing_enabled()) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC |
                             V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
                             V_0367B8_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE(1);

         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B0_SQ_THREAD_TRACE_CTRL, gfx11_get_sqtt_ctrl(device, true));
      } else if (device->physical_device->rad_info.gfx_level >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                                          S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         radeon_set_privileged_config_reg(cs, R_008D14_SQ_THREAD_TRACE_MASK,
                                          S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
                                             S_008D14_WGP_SEL(active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
                                                         V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_COMP |
                                                         V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!radv_is_instruction_timing_enabled()) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |=
            S_008D18_TOKEN_EXCLUDE(token_exclude) | S_008D18_BOP_EVENTS_TOKEN_INCLUDE(gfx_level == GFX10_3);

         radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, gfx10_get_sqtt_ctrl(device, true));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2, S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t sqtt_mask = S_030CC8_CU_SEL(active_cu) | S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
                              S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                              S_030CC8_SQ_STALL_EN(1);

         if (device->physical_device->rad_info.gfx_level < GFX9) {
            sqtt_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (device->physical_device->rad_info.gfx_level == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
                              S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
                              S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
                              S_030CD8_MODE(1);

         if (device->physical_device->rad_info.gfx_level == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            sqtt_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(
      cs, R_030800_GRBM_GFX_INDEX,
      S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (qf == RADV_QUEUE_COMPUTE) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
}

static const uint32_t gfx8_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_sqtt_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static const uint32_t gfx11_sqtt_info_regs[] = {
   R_0367BC_SQ_THREAD_TRACE_WPTR,
   R_0367D0_SQ_THREAD_TRACE_STATUS,
   R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
};
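
/* For each SE, these three registers (write pointer, status and dropped
 * counter) are copied back into the info struct that ac_sqtt/RGP parses
 * after the capture; see radv_copy_sqtt_info_regs() below.
 */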
static void
radv_copy_sqtt_info_regs(const struct radv_device *device, struct radeon_cmdbuf *cs, unsigned se_index)
{
   const struct radv_physical_device *pdevice = device->physical_device;
   const uint32_t *sqtt_info_regs = NULL;

   if (device->physical_device->rad_info.gfx_level >= GFX11) {
      sqtt_info_regs = gfx11_sqtt_info_regs;
   } else if (device->physical_device->rad_info.gfx_level >= GFX10) {
      sqtt_info_regs = gfx10_sqtt_info_regs;
   } else if (device->physical_device->rad_info.gfx_level == GFX9) {
      sqtt_info_regs = gfx9_sqtt_info_regs;
   } else {
      assert(device->physical_device->rad_info.gfx_level == GFX8);
      sqtt_info_regs = gfx8_sqtt_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = radv_buffer_get_va(device->sqtt.bo);
   uint64_t info_va = ac_sqtt_get_info_va(va, se_index);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, sqtt_info_regs[i] >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, (info_va + i * 4));
      radeon_emit(cs, (info_va + i * 4) >> 32);
   }

   if (pdevice->rad_info.gfx_level >= GFX11) {
      /* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
       * To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
       * subtract the initial WPTR offset from SQ_THREAD_TRACE_WPTR as follows:
       *
       * 1) get the current buffer base address for this SE
       * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
       * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
       */
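      /* Worked example with a hypothetical data_va of 0x100020000:
       * shifted_data_va = 0x100020000 >> 5 = 0x8001000, so init_wptr_value is
       * 0x8001000 and the atomic below rewrites the copied WPTR as
       * "WPTR - 0x8001000", i.e. the number of 32-byte units written by SQTT.
       */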
      uint64_t data_va = ac_sqtt_get_data_va(&pdevice->rad_info, &device->sqtt, va, se_index);
      uint64_t shifted_data_va = (data_va >> 5);
      uint32_t init_wptr_value = shifted_data_va & 0x1fffffff;

      radeon_emit(cs, PKT3(PKT3_ATOMIC_MEM, 7, 0));
      radeon_emit(cs, ATOMIC_OP(TC_OP_ATOMIC_SUB_32));
      radeon_emit(cs, info_va);         /* addr lo */
      radeon_emit(cs, info_va >> 32);   /* addr hi */
      radeon_emit(cs, init_wptr_value); /* data lo */
      radeon_emit(cs, 0);               /* data hi */
      radeon_emit(cs, 0);               /* compare data lo */
      radeon_emit(cs, 0);               /* compare data hi */
      radeon_emit(cs, 0);               /* loop interval */
   }
}

static void
radv_emit_sqtt_stop(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
   unsigned max_se = device->physical_device->rad_info.max_se;

   radeon_check_space(device->ws, cs, 8 + max_se * 64);

   /* Stop the thread trace with a different event based on the queue. */
   if (qf == RADV_QUEUE_COMPUTE) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));

   if (device->physical_device->rad_info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE doesn't work. */
      radv_emit_wait_for_idle(device, cs, qf);
   }

   for (unsigned se = 0; se < max_se; se++) {
      if (ac_sqtt_se_is_disabled(&device->physical_device->rad_info, se))
         continue;

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.gfx_level >= GFX11) {
         /* Wait until FINISH_DONE != 0, i.e. the trace data has been fully written to the buffer. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL);               /* wait until the register differs from the reference value */
         radeon_emit(cs, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);                     /* reference value */
         radeon_emit(cs, ~C_0367D0_FINISH_DONE); /* mask */
         radeon_emit(cs, 4);                     /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B0_SQ_THREAD_TRACE_CTRL, gfx11_get_sqtt_ctrl(device, false));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL);                   /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value */
         radeon_emit(cs, ~C_0367D0_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      } else if (device->physical_device->rad_info.gfx_level >= GFX10) {
         if (!device->physical_device->rad_info.has_sqtt_rb_harvest_bug) {
            /* Wait until FINISH_DONE != 0, i.e. the trace data has been fully written to the buffer. */
            radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
            radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL);               /* wait until the register differs from the reference value */
            radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
            radeon_emit(cs, 0);
            radeon_emit(cs, 0);                     /* reference value */
            radeon_emit(cs, ~C_008D20_FINISH_DONE); /* mask */
            radeon_emit(cs, 4);                     /* poll interval */
         }

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, gfx10_get_sqtt_ctrl(device, false));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL);                   /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value */
         radeon_emit(cs, ~C_008D20_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL);                   /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value */
         radeon_emit(cs, ~C_030CE8_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      }

      radv_copy_sqtt_info_regs(device, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(
      cs, R_030800_GRBM_GFX_INDEX,
      S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));
}

void
radv_emit_sqtt_userdata(const struct radv_cmd_buffer *cmd_buffer, const void *data, uint32_t num_dwords)
{
   const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
   const enum radv_queue_family qf = cmd_buffer->qf;
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const uint32_t *dwords = (uint32_t *)data;

   /* SQTT user data packets aren't supported on SDMA queues. */
   if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
      return;

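   /* SQ_THREAD_TRACE_USERDATA_2 and _3 are consecutive registers, so at most
    * two DWORDs can be emitted per packet.
    */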
   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_check_space(device->ws, cs, 2 + count);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      if (device->physical_device->rad_info.gfx_level >= GFX10)
         radeon_set_uconfig_reg_seq_perfctr(gfx_level, qf, cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      else
         radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(cs, dwords, count);

      dwords += count;
      num_dwords -= count;
   }
}

void
radv_emit_spi_config_cntl(const struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
{
   if (device->physical_device->rad_info.gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (device->physical_device->rad_info.gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(cs, R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
}

void
radv_emit_inhibit_clockgating(const struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (device->physical_device->rad_info.gfx_level >= GFX11)
      return; /* not needed */

   if (device->physical_device->rad_info.gfx_level >= GFX10) {
      radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (device->physical_device->rad_info.gfx_level >= GFX8) {
      radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
}

VkResult
radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo,
                                uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr)
{
   struct radeon_winsys *ws = device->ws;

   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   if (device->sqtt_timestamp.offset + 8 > device->sqtt_timestamp.size) {
      struct radeon_winsys_bo *bo;
      uint64_t new_size;
      VkResult result;
      uint8_t *map;

      new_size = MAX2(4096, 2 * device->sqtt_timestamp.size);

      result = ws->buffer_create(ws, new_size, 8, RADEON_DOMAIN_GTT,
                                 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_SCRATCH,
                                 0, &bo);
      if (result != VK_SUCCESS) {
         simple_mtx_unlock(&device->sqtt_timestamp_mtx);
         return result;
      }

      map = device->ws->buffer_map(bo);
      if (!map) {
         ws->buffer_destroy(ws, bo);
         simple_mtx_unlock(&device->sqtt_timestamp_mtx);
         return VK_ERROR_OUT_OF_DEVICE_MEMORY;
      }

      if (device->sqtt_timestamp.bo) {
         struct radv_sqtt_timestamp *new_timestamp;

         new_timestamp = malloc(sizeof(*new_timestamp));
         if (!new_timestamp) {
            ws->buffer_destroy(ws, bo);
            simple_mtx_unlock(&device->sqtt_timestamp_mtx);
            return VK_ERROR_OUT_OF_HOST_MEMORY;
         }

         memcpy(new_timestamp, &device->sqtt_timestamp, sizeof(*new_timestamp));
         list_add(&new_timestamp->list, &device->sqtt_timestamp.list);
      }

      device->sqtt_timestamp.bo = bo;
      device->sqtt_timestamp.size = new_size;
      device->sqtt_timestamp.offset = 0;
      device->sqtt_timestamp.map = map;
   }

   *gpu_timestamp_bo = device->sqtt_timestamp.bo;
   *gpu_timestamp_offset = device->sqtt_timestamp.offset;
   *gpu_timestamp_ptr = device->sqtt_timestamp.map + device->sqtt_timestamp.offset;

   device->sqtt_timestamp.offset += 8;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);

   return VK_SUCCESS;
}
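
/* Sketch of the expected usage from the timed-cmdbuf path (hypothetical
 * caller; see radv_sqtt_get_timed_cmdbuf() below):
 *
 *    struct radeon_winsys_bo *bo;
 *    uint32_t offset;
 *    void *ptr;
 *    VkCommandBuffer cmdbuf;
 *
 *    if (radv_sqtt_acquire_gpu_timestamp(device, &bo, &offset, &ptr) == VK_SUCCESS)
 *       radv_sqtt_get_timed_cmdbuf(queue, bo, offset,
 *                                  VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, &cmdbuf);
 */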

static void
radv_sqtt_reset_timestamp(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   simple_mtx_lock(&device->sqtt_timestamp_mtx);

   list_for_each_entry_safe (struct radv_sqtt_timestamp, ts, &device->sqtt_timestamp.list, list) {
      ws->buffer_destroy(ws, ts->bo);
      list_del(&ts->list);
      free(ts);
   }

   device->sqtt_timestamp.offset = 0;

   simple_mtx_unlock(&device->sqtt_timestamp_mtx);
}

static bool
radv_sqtt_init_queue_event(struct radv_device *device)
{
   VkCommandPool cmd_pool;
   VkResult result;

   const VkCommandPoolCreateInfo create_gfx_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
      .queueFamilyIndex = RADV_QUEUE_GENERAL, /* Graphics queue is always the first queue. */
   };

   result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_gfx_info, NULL, &cmd_pool);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt_command_pool[0] = vk_command_pool_from_handle(cmd_pool);

   if (!(device->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
      const VkCommandPoolCreateInfo create_comp_info = {
         .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
         .queueFamilyIndex = RADV_QUEUE_COMPUTE,
      };

      result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_comp_info, NULL, &cmd_pool);
      if (result != VK_SUCCESS)
         return false;

      device->sqtt_command_pool[1] = vk_command_pool_from_handle(cmd_pool);
   }

   simple_mtx_init(&device->sqtt_command_pool_mtx, mtx_plain);

   simple_mtx_init(&device->sqtt_timestamp_mtx, mtx_plain);
   list_inithead(&device->sqtt_timestamp.list);

   return true;
}

static void
radv_sqtt_finish_queue_event(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (device->sqtt_timestamp.bo)
      ws->buffer_destroy(ws, device->sqtt_timestamp.bo);

   simple_mtx_destroy(&device->sqtt_timestamp_mtx);

   for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++)
      vk_common_DestroyCommandPool(radv_device_to_handle(device),
                                   vk_command_pool_to_handle(device->sqtt_command_pool[i]), NULL);

   simple_mtx_destroy(&device->sqtt_command_pool_mtx);
}

static bool
radv_sqtt_init_bo(struct radv_device *device)
{
   unsigned max_se = device->physical_device->rad_info.max_se;
   struct radeon_winsys *ws = device->ws;
   VkResult result;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += device->sqtt.buffer_size * (uint64_t)max_se;
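
   /* Resulting layout of the SQTT BO (offsets come from ac_sqtt_get_info_va()
    * and ac_sqtt_get_data_va()):
    *
    *    [ ac_sqtt_data_info, one per SE ]   <- 4 KiB-aligned block
    *    [ data buffer for SE0 ]             <- buffer_size bytes each
    *    [ data buffer for SE1 ]
    *    ...
    */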

   struct radeon_winsys_bo *bo = NULL;
   result = ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_VRAM,
                              RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
                              RADV_BO_PRIORITY_SCRATCH, 0, &bo);
   device->sqtt.bo = bo;
   if (result != VK_SUCCESS)
      return false;

   result = ws->buffer_make_resident(ws, device->sqtt.bo, true);
   if (result != VK_SUCCESS)
      return false;

   device->sqtt.ptr = ws->buffer_map(device->sqtt.bo);
   if (!device->sqtt.ptr)
      return false;

   return true;
}

static void
radv_sqtt_finish_bo(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (unlikely(device->sqtt.bo)) {
      ws->buffer_make_resident(ws, device->sqtt.bo, false);
      ws->buffer_destroy(ws, device->sqtt.bo);
   }
}

static VkResult
radv_register_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;
   struct rgp_queue_info_record *record;

   record = malloc(sizeof(struct rgp_queue_info_record));
   if (!record)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   record->queue_id = (uintptr_t)queue;
   record->queue_context = (uintptr_t)queue->hw_ctx;
   if (queue->vk.queue_family_index == RADV_QUEUE_GENERAL) {
      record->hardware_info.queue_type = SQTT_QUEUE_TYPE_UNIVERSAL;
      record->hardware_info.engine_type = SQTT_ENGINE_TYPE_UNIVERSAL;
   } else {
      record->hardware_info.queue_type = SQTT_QUEUE_TYPE_COMPUTE;
      record->hardware_info.engine_type = SQTT_ENGINE_TYPE_COMPUTE;
   }

   simple_mtx_lock(&queue_info->lock);
   list_addtail(&record->list, &queue_info->record);
   queue_info->record_count++;
   simple_mtx_unlock(&queue_info->lock);

   return VK_SUCCESS;
}

static void
radv_unregister_queue(struct radv_device *device, struct radv_queue *queue)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;

   /* Destroy queue info record. */
   simple_mtx_lock(&queue_info->lock);
   if (queue_info->record_count > 0) {
      list_for_each_entry_safe (struct rgp_queue_info_record, record, &queue_info->record, list) {
         if (record->queue_id == (uintptr_t)queue) {
            queue_info->record_count--;
            list_del(&record->list);
            free(record);
            break;
         }
      }
   }
   simple_mtx_unlock(&queue_info->lock);
}

static void
radv_register_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_register_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
      radv_register_queue(device, &device->queues[RADV_QUEUE_COMPUTE][i]);
}

static void
radv_unregister_queues(struct radv_device *device, struct ac_sqtt *sqtt)
{
   if (device->queue_count[RADV_QUEUE_GENERAL] == 1)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);

   for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
      radv_unregister_queue(device, &device->queues[RADV_QUEUE_COMPUTE][i]);
}

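/* Capture setup: allocate the SQTT BO, the queue-event command pools and
 * timestamp storage, then register the queues with RGP. A capture then runs
 * as radv_begin_sqtt() -> workload -> radv_end_sqtt() ->
 * radv_get_sqtt_trace(), with radv_reset_sqtt_trace() between captures.
 */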
bool
radv_sqtt_init(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;

   /* Default buffer size set to 32MB per SE. */
   device->sqtt.buffer_size = (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);

   if (!radv_sqtt_init_bo(device))
      return false;

   if (!radv_sqtt_init_queue_event(device))
      return false;

   if (!radv_device_acquire_performance_counters(device))
      return false;

   ac_sqtt_init(sqtt);

   radv_register_queues(device, sqtt);

   return true;
}

void
radv_sqtt_finish(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct radeon_winsys *ws = device->ws;

   radv_sqtt_finish_bo(device);
   radv_sqtt_finish_queue_event(device);

   for (unsigned i = 0; i < 2; i++) {
      if (device->sqtt.start_cs[i])
         ws->cs_destroy(device->sqtt.start_cs[i]);
      if (device->sqtt.stop_cs[i])
         ws->cs_destroy(device->sqtt.stop_cs[i]);
   }

   radv_unregister_queues(device, sqtt);

   ac_sqtt_finish(sqtt);
}

static bool
radv_sqtt_resize_bo(struct radv_device *device)
{
   /* Destroy the previous thread trace BO. */
   radv_sqtt_finish_bo(device);

   /* Double the size of the thread trace buffer per SE. */
   device->sqtt.buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           device->sqtt.buffer_size / 1024);

   /* Re-create the thread trace BO. */
   return radv_sqtt_init_bo(device);
}

bool
radv_begin_sqtt(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   enum radv_queue_family family = queue->state.qf;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous start CS and create a new one. */
   if (device->sqtt.start_cs[family]) {
      ws->cs_destroy(device->sqtt.start_cs[family]);
      device->sqtt.start_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, radv_queue_ring(queue), false);
   if (!cs)
      return false;

   radeon_check_space(ws, cs, 512);

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   default:
      unreachable("Incorrect queue family");
      break;
   }

   /* Make sure to wait-for-idle before starting SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Disable clock gating before starting SQTT. */
   radv_emit_inhibit_clockgating(device, cs, true);

   /* Enable SQG events that collect thread trace data. */
   radv_emit_spi_config_cntl(device, cs, true);

   radv_perfcounter_emit_spm_reset(cs);

   if (device->spm.bo) {
      /* Enable all shader stages by default. */
      radv_perfcounter_emit_shaders(device, cs, ac_sqtt_get_shader_mask(&device->physical_device->rad_info));

      radv_emit_spm_setup(device, cs, family);
   }

   /* Start SQTT. */
   radv_emit_sqtt_start(device, cs, family);

   if (device->spm.bo)
      radv_perfcounter_emit_spm_start(device, cs, family);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->sqtt.start_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

bool
radv_end_sqtt(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   enum radv_queue_family family = queue->state.qf;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous stop CS and create a new one. */
   if (queue->device->sqtt.stop_cs[family]) {
      ws->cs_destroy(device->sqtt.stop_cs[family]);
      device->sqtt.stop_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, radv_queue_ring(queue), false);
   if (!cs)
      return false;

   radeon_check_space(ws, cs, 512);

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   default:
      unreachable("Incorrect queue family");
      break;
   }

   /* Make sure to wait-for-idle before stopping SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   if (device->spm.bo)
      radv_perfcounter_emit_spm_stop(device, cs, family);

   /* Stop SQTT. */
   radv_emit_sqtt_stop(device, cs, family);

   radv_perfcounter_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   radv_emit_spi_config_cntl(device, cs, false);

   /* Restore previous state by re-enabling clock gating. */
   radv_emit_inhibit_clockgating(device, cs, false);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->sqtt.stop_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

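/* Copy out the finished trace. If the hardware overflowed the buffer,
 * radv_sqtt_resize_bo() doubles it and false is returned so the caller can
 * run the capture again with the larger buffer.
 */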
bool
radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace)
{
   struct radv_device *device = queue->device;
   const struct radeon_info *rad_info = &device->physical_device->rad_info;

   if (!ac_sqtt_get_trace(&device->sqtt, rad_info, sqtt_trace)) {
      if (!radv_sqtt_resize_bo(device))
         fprintf(stderr, "radv: Failed to resize the SQTT buffer.\n");
      return false;
   }

   return true;
}

void
radv_reset_sqtt_trace(struct radv_device *device)
{
   struct ac_sqtt *sqtt = &device->sqtt;
   struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
   struct rgp_queue_event *queue_event = &sqtt->rgp_queue_event;

   /* Clear clock calibration records. */
   simple_mtx_lock(&clock_calibration->lock);
   list_for_each_entry_safe (struct rgp_clock_calibration_record, record, &clock_calibration->record, list) {
      clock_calibration->record_count--;
      list_del(&record->list);
      free(record);
   }
   simple_mtx_unlock(&clock_calibration->lock);

   /* Clear queue event records. */
   simple_mtx_lock(&queue_event->lock);
   list_for_each_entry_safe (struct rgp_queue_event_record, record, &queue_event->record, list) {
      list_del(&record->list);
      free(record);
   }
   queue_event->record_count = 0;
   simple_mtx_unlock(&queue_event->lock);

   /* Clear timestamps. */
   radv_sqtt_reset_timestamp(device);

   /* Clear timed cmdbufs. */
   simple_mtx_lock(&device->sqtt_command_pool_mtx);
   for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++) {
      vk_common_TrimCommandPool(radv_device_to_handle(device), vk_command_pool_to_handle(device->sqtt_command_pool[i]),
                                0);
   }
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
}

static VkResult
radv_get_calibrated_timestamps(struct radv_device *device, uint64_t *cpu_timestamp, uint64_t *gpu_timestamp)
{
   uint64_t timestamps[2];
   uint64_t max_deviation;
   VkResult result;

   const VkCalibratedTimestampInfoKHR timestamp_infos[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
         .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
      },
      {
         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
         .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
      },
   };

   result =
      radv_GetCalibratedTimestampsKHR(radv_device_to_handle(device), 2, timestamp_infos, timestamps, &max_deviation);
   if (result != VK_SUCCESS)
      return result;

   *cpu_timestamp = timestamps[0];
   *gpu_timestamp = timestamps[1];

   return result;
}

bool
radv_sqtt_sample_clocks(struct radv_device *device)
{
   uint64_t cpu_timestamp = 0, gpu_timestamp = 0;
   VkResult result;

   result = radv_get_calibrated_timestamps(device, &cpu_timestamp, &gpu_timestamp);
   if (result != VK_SUCCESS)
      return false;

   return ac_sqtt_add_clock_calibration(&device->sqtt, cpu_timestamp, gpu_timestamp);
}

VkResult
radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_winsys_bo *timestamp_bo, uint32_t timestamp_offset,
                           VkPipelineStageFlags2 timestamp_stage, VkCommandBuffer *pcmdbuf)
{
   struct radv_device *device = queue->device;
   enum radv_queue_family queue_family = queue->state.qf;
   VkCommandBuffer cmdbuf;
   uint64_t timestamp_va;
   VkResult result;

   assert(queue_family == RADV_QUEUE_GENERAL || queue_family == RADV_QUEUE_COMPUTE);

   simple_mtx_lock(&device->sqtt_command_pool_mtx);

   const VkCommandBufferAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
      .commandPool = vk_command_pool_to_handle(device->sqtt_command_pool[queue_family]),
      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
      .commandBufferCount = 1,
   };

   result = vk_common_AllocateCommandBuffers(radv_device_to_handle(device), &alloc_info, &cmdbuf);
   if (result != VK_SUCCESS)
      goto fail;

   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
   };

   result = radv_BeginCommandBuffer(cmdbuf, &begin_info);
   if (result != VK_SUCCESS)
      goto fail;

   radeon_check_space(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, 28);

   timestamp_va = radv_buffer_get_va(timestamp_bo) + timestamp_offset;

   radv_cs_add_buffer(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, timestamp_bo);

   radv_write_timestamp(radv_cmd_buffer_from_handle(cmdbuf), timestamp_va, timestamp_stage);

   result = radv_EndCommandBuffer(cmdbuf);
   if (result != VK_SUCCESS)
      goto fail;

   *pcmdbuf = cmdbuf;

fail:
   simple_mtx_unlock(&device->sqtt_command_pool_mtx);
   return result;
}