1 /*
2 * Copyright 2020 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "amd_family.h"
8 #include "si_build_pm4.h"
9 #include "si_pipe.h"
10
11 #include "tgsi/tgsi_from_mesa.h"
12 #include "util/hash_table.h"
13 #include "util/u_debug.h"
14 #include "util/u_memory.h"
15 #include "ac_rgp.h"
16 #include "ac_sqtt.h"
17
18 static void
19 si_emit_spi_config_cntl(struct si_context *sctx,
20 struct radeon_cmdbuf *cs, bool enable);
21
22 static bool si_sqtt_init_bo(struct si_context *sctx)
23 {
24 unsigned max_se = sctx->screen->info.max_se;
25 struct radeon_winsys *ws = sctx->ws;
26 uint64_t size;
27
28 /* The buffer size and address need to be aligned in HW regs. Align the
29 * size as early as possible so that we do all the allocation & addressing
30 * correctly. */
31 sctx->sqtt->buffer_size =
32 align64(sctx->sqtt->buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);
33
34 /* Compute total size of the thread trace BO for all SEs. */
35 size = align64(sizeof(struct ac_sqtt_data_info) * max_se,
36 1 << SQTT_BUFFER_ALIGN_SHIFT);
37 size += sctx->sqtt->buffer_size * (uint64_t)max_se;
38
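/* Hash table mapping pipeline hashes to the fake pipeline objects that are
 * reported to RGP as code objects. */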
39 sctx->sqtt->pipeline_bos = _mesa_hash_table_u64_create(NULL);
40
41 sctx->sqtt->bo =
42 ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_VRAM,
43 RADEON_FLAG_NO_INTERPROCESS_SHARING |
44 RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_SUBALLOC);
45 if (!sctx->sqtt->bo)
46 return false;
47
48 return true;
49 }
50
51 static void si_emit_sqtt_start(struct si_context *sctx,
52 struct radeon_cmdbuf *cs,
53 enum amd_ip_type ip_type)
54 {
55 struct si_screen *sscreen = sctx->screen;
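/* The HW registers take the trace buffer base and size shifted right by
 * SQTT_BUFFER_ALIGN_SHIFT, which is why the size was aligned at allocation time. */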
56 uint32_t shifted_size = sctx->sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
57 unsigned max_se = sscreen->info.max_se;
58
59 radeon_begin(cs);
60
61 for (unsigned se = 0; se < max_se; se++) {
62 uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);
63 uint64_t data_va =
64 ac_sqtt_get_data_va(&sctx->screen->info, sctx->sqtt, va, se);
65 uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
66
67 if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
68 continue;
69
70 /* Target SEx and SH0. */
71 radeon_set_uconfig_perfctr_reg_seq(R_030800_GRBM_GFX_INDEX, 1);
72 radeon_emit(S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) |
73 S_030800_INSTANCE_BROADCAST_WRITES(1));
74
75 /* Select the first active CUs */
76 int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);
77
78 if (sctx->gfx_level >= GFX10) {
79 uint32_t token_mask =
80 V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
81 V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_CONTEXT |
82 V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONFIG;
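/* On GFX10+, CUs are paired into WGPs, so convert the CU index into a WGP index. */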
83 int wgp = first_active_cu / 2;
84 unsigned shader_mask = 0x7f; /* all shader stages */
85
86 /* Order seems important for the following 2 registers. */
87 if (sctx->gfx_level >= GFX11) {
88 /* Disable unsupported hw shader stages */
89 shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
90
91 radeon_set_uconfig_perfctr_reg_seq(R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, 2);
92 radeon_emit(shifted_va);
93 radeon_emit(S_0367A4_SIZE(shifted_size) |
94 S_0367A4_BASE_HI(shifted_va >> 32));
95
96 radeon_set_uconfig_perfctr_reg_seq(R_0367B4_SQ_THREAD_TRACE_MASK, 2);
97 radeon_emit(S_0367B4_WTYPE_INCLUDE(shader_mask) |
98 S_0367B4_SA_SEL(0) | S_0367B4_WGP_SEL(wgp) |
99 S_0367B4_SIMD_SEL(0));
100 radeon_emit(S_0367B8_REG_INCLUDE(token_mask) |
101 S_0367B8_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
102 } else {
103 radeon_set_privileged_config_reg(
104 R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
105 S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));
106
107 radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE,
108 shifted_va);
109
110 radeon_set_privileged_config_reg(
111 R_008D14_SQ_THREAD_TRACE_MASK,
112 S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
113 S_008D14_WGP_SEL(wgp) | S_008D14_SIMD_SEL(0));
114
115 radeon_set_privileged_config_reg(
116 R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
117 S_008D18_REG_INCLUDE(token_mask) |
118 S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
119 }
120
121 /* Should be emitted last (it enables thread traces). */
122 uint32_t ctrl = S_008D1C_MODE(1) | S_008D1C_HIWATER(5) |
123 S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) /* 4096 clk */ |
124 S_008D1C_DRAW_EVENT_EN(1);
125
126 if (sctx->gfx_level == GFX10_3)
127 ctrl |= S_008D1C_LOWATER_OFFSET(4);
128
129 ctrl |= S_008D1C_AUTO_FLUSH_MODE(
130 sctx->screen->info.has_sqtt_auto_flush_mode_bug);
131
132 switch (sctx->gfx_level) {
133 case GFX10:
134 case GFX10_3:
135 ctrl |= S_008D1C_REG_STALL_EN(1) |
136 S_008D1C_SPI_STALL_EN(1) |
137 S_008D1C_SQ_STALL_EN(1) |
138 S_008D1C_REG_DROP_ON_STALL(0);
139 radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, ctrl);
140 break;
141 case GFX11:
142 ctrl |= S_0367B0_SPI_STALL_EN(1) |
143 S_0367B0_SQ_STALL_EN(1) |
144 S_0367B0_REG_AT_HWM(2);
145 radeon_set_uconfig_perfctr_reg_seq(R_0367B0_SQ_THREAD_TRACE_CTRL, 1);
146 radeon_emit(ctrl);
147 break;
148 default:
149 assert(false);
150 }
151 } else {
152 /* Order seems important for the following 4 registers. */
153 radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
154 S_030CDC_ADDR_HI(shifted_va >> 32));
155
156 radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);
157
158 radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
159 S_030CC4_SIZE(shifted_size));
160
161 radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
162 S_030CD4_RESET_BUFFER(1));
163
164 uint32_t sqtt_mask = S_030CC8_CU_SEL(first_active_cu) |
165 S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
166 S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) |
167 S_030CC8_SPI_STALL_EN(1) | S_030CC8_SQ_STALL_EN(1);
168
169 radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);
170
171 /* Trace all tokens and registers. */
172 radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
173 S_030CCC_TOKEN_MASK(0xbfff) |
174 S_030CCC_REG_MASK(0xff) |
175 S_030CCC_REG_DROP_ON_STALL(0));
176
177 /* Enable SQTT perf counters for all CUs. */
178 radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
179 S_030CD0_SH0_MASK(0xffff) |
180 S_030CD0_SH1_MASK(0xffff));
181
182 radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);
183
184 radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
185 S_030CEC_HIWATER(4));
186
187 if (sctx->gfx_level == GFX9) {
188 /* Reset thread trace status errors. */
189 radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
190 S_030CE8_UTC_ERROR(0));
191 }
192
193 /* Enable the thread trace mode. */
194 uint32_t sqtt_mode = S_030CD8_MASK_PS(1) |
195 S_030CD8_MASK_VS(1) |
196 S_030CD8_MASK_GS(1) |
197 S_030CD8_MASK_ES(1) |
198 S_030CD8_MASK_HS(1) |
199 S_030CD8_MASK_LS(1) |
200 S_030CD8_MASK_CS(1) |
201 S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
202 S_030CD8_MODE(1);
203
204 if (sctx->gfx_level == GFX9) {
205 /* Count SQTT traffic in TCC perf counters. */
206 sqtt_mode |= S_030CD8_TC_PERF_EN(1);
207 }
208
209 radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
210 }
211 }
212
213 /* Restore global broadcasting. */
214 radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
215 S_030800_SE_BROADCAST_WRITES(1) |
216 S_030800_SH_BROADCAST_WRITES(1) |
217 S_030800_INSTANCE_BROADCAST_WRITES(1));
218
219 /* Start the thread trace with a different event based on the queue. */
220 if (ip_type == AMD_IP_COMPUTE) {
221 radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
222 S_00B878_THREAD_TRACE_ENABLE(1));
223 } else {
224 radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
225 radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
226 }
227 radeon_end();
228 }
229
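/* Per-generation registers holding the thread trace write pointer, status and
 * dropped-packet counter; they are copied into the per-SE ac_sqtt_data_info
 * struct when the trace is stopped. */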
230 static const uint32_t gfx9_sqtt_info_regs[] = {
231 R_030CE4_SQ_THREAD_TRACE_WPTR,
232 R_030CE8_SQ_THREAD_TRACE_STATUS,
233 R_030CF0_SQ_THREAD_TRACE_CNTR,
234 };
235
236 static const uint32_t gfx10_sqtt_info_regs[] = {
237 R_008D10_SQ_THREAD_TRACE_WPTR,
238 R_008D20_SQ_THREAD_TRACE_STATUS,
239 R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
240 };
241
242 static const uint32_t gfx11_sqtt_info_regs[] = {
243 R_0367BC_SQ_THREAD_TRACE_WPTR,
244 R_0367D0_SQ_THREAD_TRACE_STATUS,
245 R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
246 };
247
248 static void si_copy_sqtt_info_regs(struct si_context *sctx,
249 struct radeon_cmdbuf *cs,
250 unsigned se_index)
251 {
252 const uint32_t *sqtt_info_regs = NULL;
253
254 switch (sctx->gfx_level) {
255 case GFX10_3:
256 case GFX10:
257 sqtt_info_regs = gfx10_sqtt_info_regs;
258 break;
259 case GFX11:
260 sqtt_info_regs = gfx11_sqtt_info_regs;
261 break;
262 case GFX9:
263 sqtt_info_regs = gfx9_sqtt_info_regs;
264 break;
265 default:
266 unreachable("Unsupported gfx_level");
267 }
268
269 /* Get the VA where the info struct is stored for this SE. */
270 uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);
271 uint64_t info_va = ac_sqtt_get_info_va(va, se_index);
272
273 radeon_begin(cs);
274
275 /* Copy back the info struct one DWORD at a time. */
276 for (unsigned i = 0; i < 3; i++) {
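/* COPY_DATA with SRC_SEL=PERF reads the (privileged) register and writes it to
 * memory through the L2 cache. */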
277 radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
278 radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
279 COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
280 radeon_emit(sqtt_info_regs[i] >> 2);
281 radeon_emit(0); /* unused */
282 radeon_emit((info_va + i * 4));
283 radeon_emit((info_va + i * 4) >> 32);
284 }
285
286 if (sctx->gfx_level == GFX11) {
287 /* On GFX11, WPTR starts counting from the offset of the current buffer base
288 * address, so that base offset needs to be subtracted to get the correct offset:
289 *
290 * 1) get the current buffer base address for this SE
291 * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
292 * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
293 */
294 uint64_t data_va =
295 ac_sqtt_get_data_va(&sctx->screen->info, sctx->sqtt, va, se_index);
296 uint64_t shifted_data_va = (data_va >> 5);
297 uint64_t init_wptr_value = shifted_data_va & 0x1fffffff;
298
299 radeon_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0));
300 radeon_emit(ATOMIC_OP(TC_OP_ATOMIC_SUB_32));
301 radeon_emit(info_va);
302 radeon_emit(info_va >> 32);
303 radeon_emit(init_wptr_value);
304 radeon_emit(init_wptr_value >> 32);
305 radeon_emit(0);
306 radeon_emit(0);
307 radeon_emit(0);
308 }
309
310 radeon_end();
311 }
312
313 static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs,
314 enum amd_ip_type ip_type)
315 {
316 unsigned max_se = sctx->screen->info.max_se;
317 radeon_begin(cs);
318
319 /* Stop the thread trace with a different event based on the queue. */
320 if (ip_type == AMD_IP_COMPUTE) {
321 radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
322 S_00B878_THREAD_TRACE_ENABLE(0));
323 } else {
324 radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
325 radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
326 }
327
328 radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
329 radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
330 radeon_end();
331
332 if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
333 /* On chips with disabled RBs, FINISH_DONE doesn't work, so wait for idle
334 * instead. */
335 sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
336 SI_CONTEXT_CS_PARTIAL_FLUSH;
337 sctx->emit_cache_flush(sctx, cs);
338 }
339
340 for (unsigned se = 0; se < max_se; se++) {
341 if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
342 continue;
343
344 radeon_begin(cs);
345
346 /* Target SEi and SH0. */
347 radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
348 S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) |
349 S_030800_INSTANCE_BROADCAST_WRITES(1));
350
351 if (sctx->gfx_level >= GFX10) {
352 uint32_t tt_status_reg = sctx->gfx_level >= GFX11 ?
353 R_0367D0_SQ_THREAD_TRACE_STATUS:
354 R_008D20_SQ_THREAD_TRACE_STATUS;
355 if (!sctx->screen->info.has_sqtt_rb_harvest_bug) {
356 /* Make sure to wait for the trace buffer. */
357 radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
358 radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs
359 from the reference value */
360 radeon_emit(tt_status_reg >> 2); /* register */
361 radeon_emit(0);
362 radeon_emit(0); /* reference value */
363 radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_FINISH_DONE:
364 ~C_008D20_FINISH_DONE); /* mask */
365 radeon_emit(4); /* poll interval */
366 }
367
368 /* Disable the thread trace mode. */
369 if (sctx->gfx_level >= GFX11) {
370 radeon_set_uconfig_perfctr_reg_seq(R_0367B0_SQ_THREAD_TRACE_CTRL, 1);
371 radeon_emit(S_008D1C_MODE(0));
372 } else {
373 radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
374 S_008D1C_MODE(0));
375 }
376
377 /* Wait for thread trace completion. */
378 radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
379 radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to
380 the reference value */
381 radeon_emit(tt_status_reg >> 2); /* register */
382 radeon_emit(0);
383 radeon_emit(0); /* reference value */
384 radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_BUSY:
385 ~C_008D20_BUSY); /* mask */
386 radeon_emit(4); /* poll interval */
387 } else {
388 /* Disable the thread trace mode. */
389 radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));
390
391 /* Wait for thread trace completion. */
392 radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
393 radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to
394 the reference value */
395 radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
396 radeon_emit(0);
397 radeon_emit(0); /* reference value */
398 radeon_emit(~C_030CE8_BUSY); /* mask */
399 radeon_emit(4); /* poll interval */
400 }
401 radeon_end();
402
403 si_copy_sqtt_info_regs(sctx, cs, se);
404 }
405
406 /* Restore global broadcasting. */
407 radeon_begin_again(cs);
408 radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
409 S_030800_SE_BROADCAST_WRITES(1) |
410 S_030800_SH_BROADCAST_WRITES(1) |
411 S_030800_INSTANCE_BROADCAST_WRITES(1));
412 radeon_end();
413 }
414
415 static void si_sqtt_start(struct si_context *sctx, struct radeon_cmdbuf *cs)
416 {
417 struct radeon_winsys *ws = sctx->ws;
418 enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);
419
420 radeon_begin(cs);
421
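/* These command buffers are submitted on their own, so emit a queue-specific
 * preamble first (CONTEXT_CONTROL on GFX, a NOP on compute). */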
422 switch (ip_type) {
423 case AMD_IP_GFX:
424 radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
425 radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
426 radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
427 break;
428 case AMD_IP_COMPUTE:
429 radeon_emit(PKT3(PKT3_NOP, 0, 0));
430 radeon_emit(0);
431 break;
432 default:
433 /* Unsupported. */
434 assert(false);
435 }
436 radeon_end();
437
438 ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
439 RADEON_DOMAIN_VRAM);
440 if (sctx->spm.bo)
441 ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
442 RADEON_DOMAIN_VRAM);
443
444 si_cp_dma_wait_for_idle(sctx, cs);
445
446 /* Make sure to wait-for-idle before starting SQTT. */
447 sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
448 SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE |
449 SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_L2 |
450 SI_CONTEXT_PFP_SYNC_ME;
451 sctx->emit_cache_flush(sctx, cs);
452
453 si_inhibit_clockgating(sctx, cs, true);
454
455 /* Enable SQG events that collect thread trace data. */
456 si_emit_spi_config_cntl(sctx, cs, true);
457
458 if (sctx->spm.bo) {
459 si_pc_emit_spm_reset(cs);
460 si_pc_emit_shaders(cs, 0x7f);
461 si_emit_spm_setup(sctx, cs);
462 }
463
464 si_emit_sqtt_start(sctx, cs, ip_type);
465
466 if (sctx->spm.bo)
467 si_pc_emit_spm_start(cs);
468 }
469
470 static void si_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs)
471 {
472 struct radeon_winsys *ws = sctx->ws;
473 enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);
474
475 radeon_begin(cs);
476
477 switch (ip_type) {
478 case AMD_IP_GFX:
479 radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
480 radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
481 radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
482 break;
483 case AMD_IP_COMPUTE:
484 radeon_emit(PKT3(PKT3_NOP, 0, 0));
485 radeon_emit(0);
486 break;
487 default:
488 /* Unsupported. */
489 assert(false);
490 }
491 radeon_end();
492
493 ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
494 RADEON_DOMAIN_VRAM);
495
496 if (sctx->spm.bo)
497 ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
498 RADEON_DOMAIN_VRAM);
499
500 si_cp_dma_wait_for_idle(sctx, cs);
501
502 if (sctx->spm.bo)
503 si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
504 sctx->screen->info.never_send_perfcounter_stop);
505
506 /* Make sure to wait-for-idle before stopping SQTT. */
507 sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
508 SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE |
509 SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_L2 |
510 SI_CONTEXT_PFP_SYNC_ME;
511 sctx->emit_cache_flush(sctx, cs);
512
513 si_emit_sqtt_stop(sctx, cs, ip_type);
514
515 if (sctx->spm.bo)
516 si_pc_emit_spm_reset(cs);
517
518 /* Restore previous state by disabling SQG events. */
519 si_emit_spi_config_cntl(sctx, cs, false);
520
521 si_inhibit_clockgating(sctx, cs, false);
522 }
523
524 static void si_sqtt_init_cs(struct si_context *sctx)
525 {
526 struct radeon_winsys *ws = sctx->ws;
527
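/* Pre-record one start and one stop command buffer per IP type so that a
 * capture can be started/stopped by simply submitting them. */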
528 for (unsigned i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
529 sctx->sqtt->start_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
530 if (!ws->cs_create(sctx->sqtt->start_cs[i], sctx->ctx, (enum amd_ip_type)i,
531 NULL, NULL)) {
532 free(sctx->sqtt->start_cs[i]);
533 sctx->sqtt->start_cs[i] = NULL;
534 return;
535 }
536 si_sqtt_start(sctx, sctx->sqtt->start_cs[i]);
537
538 sctx->sqtt->stop_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
539 if (!ws->cs_create(sctx->sqtt->stop_cs[i], sctx->ctx, (enum amd_ip_type)i,
540 NULL, NULL)) {
541 ws->cs_destroy(sctx->sqtt->start_cs[i]);
542 free(sctx->sqtt->start_cs[i]);
543 sctx->sqtt->start_cs[i] = NULL;
544 free(sctx->sqtt->stop_cs[i]);
545 sctx->sqtt->stop_cs[i] = NULL;
546 return;
547 }
548
549 si_sqtt_stop(sctx, sctx->sqtt->stop_cs[i]);
550 }
551 }
552
553 static void si_begin_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
554 {
555 struct radeon_cmdbuf *cs = sctx->sqtt->start_cs[sctx->ws->cs_get_ip_type(rcs)];
556 sctx->ws->cs_flush(cs, 0, NULL);
557 }
558
559 static void si_end_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
560 {
561 struct radeon_cmdbuf *cs = sctx->sqtt->stop_cs[sctx->ws->cs_get_ip_type(rcs)];
562 sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
563 }
564
565 static bool si_get_sqtt_trace(struct si_context *sctx,
566 struct ac_sqtt_trace *sqtt)
567 {
568 unsigned max_se = sctx->screen->info.max_se;
569
570 memset(sqtt, 0, sizeof(*sqtt));
571
572 sctx->sqtt->ptr =
573 sctx->ws->buffer_map(sctx->ws, sctx->sqtt->bo, NULL, PIPE_MAP_READ);
574
575 if (!sctx->sqtt->ptr)
576 return false;
577
578 if (!ac_sqtt_get_trace(sctx->sqtt, &sctx->screen->info, sqtt)) {
579 void *sqtt_ptr = sctx->sqtt->ptr;
580
581 for (unsigned se = 0; se < max_se; se++) {
582 uint64_t info_offset = ac_sqtt_get_info_offset(se);
583 void *info_ptr = sqtt_ptr + info_offset;
584 struct ac_sqtt_data_info *info = (struct ac_sqtt_data_info *)info_ptr;
585
586 if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
587 continue;
588
589 if (!ac_is_sqtt_complete(&sctx->screen->info, sctx->sqtt, info)) {
590 uint32_t expected_size =
591 ac_get_expected_buffer_size(&sctx->screen->info, info);
592 uint32_t available_size = (info->cur_offset * 32) / 1024;
593
594 fprintf(stderr,
595 "Failed to get the thread trace "
596 "because the buffer is too small. The "
597 "hardware needs %d KB but the "
598 "buffer size is %d KB.\n",
599 expected_size, available_size);
600 fprintf(stderr, "Please update the buffer size with "
601 "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
602 return false;
603 }
604 }
605 }
606
607 return true;
608 }
609
610 bool si_init_sqtt(struct si_context *sctx)
611 {
612 static bool warn_once = true;
613 if (warn_once) {
614 fprintf(stderr, "*************************************************\n");
615 fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
616 fprintf(stderr, "*************************************************\n");
617 warn_once = false;
618 }
619
620 sctx->sqtt = CALLOC_STRUCT(ac_sqtt);
621
622 if (sctx->gfx_level < GFX8) {
623 fprintf(stderr, "GPU hardware not supported: refer to "
624 "the RGP documentation for the list of "
625 "supported GPUs!\n");
626 return false;
627 }
628
629 if (sctx->gfx_level > GFX11) {
630 fprintf(stderr, "radeonsi: Thread trace is not supported "
631 "for that GPU!\n");
632 return false;
633 }
634
635 /* Default buffer size set to 32MB per SE. */
636 sctx->sqtt->buffer_size =
637 debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
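/* Unless a trigger is configured below, automatically capture the 10th frame. */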
638 sctx->sqtt->start_frame = 10;
639
640 const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
641 if (trigger) {
642 sctx->sqtt->start_frame = atoi(trigger);
643 if (sctx->sqtt->start_frame <= 0) {
644 /* This isn't a frame number, must be a file */
645 sctx->sqtt->trigger_file = strdup(trigger);
646 sctx->sqtt->start_frame = -1;
647 }
648 }
649
650 if (!si_sqtt_init_bo(sctx))
651 return false;
652
653 ac_sqtt_init(sctx->sqtt);
654
655 if (sctx->gfx_level >= GFX10 &&
656 debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) {
657 /* Limit SPM counters to GFX10 and GFX10_3 for now */
658 ASSERTED bool r = si_spm_init(sctx);
659 assert(r);
660 }
661
662 si_sqtt_init_cs(sctx);
663
664 sctx->sqtt_next_event = EventInvalid;
665
666 return true;
667 }
668
669 void si_destroy_sqtt(struct si_context *sctx)
670 {
671 struct si_screen *sscreen = sctx->screen;
672 struct pb_buffer_lean *bo = sctx->sqtt->bo;
673 radeon_bo_reference(sctx->screen->ws, &bo, NULL);
674
675 if (sctx->sqtt->trigger_file)
676 free(sctx->sqtt->trigger_file);
677
678 for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
679 sscreen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
680 sscreen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
681 }
682
683 struct rgp_pso_correlation *pso_correlation =
684 &sctx->sqtt->rgp_pso_correlation;
685 struct rgp_loader_events *loader_events = &sctx->sqtt->rgp_loader_events;
686 struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
687 list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
688 &pso_correlation->record, list) {
689 list_del(&record->list);
690 pso_correlation->record_count--;
691 free(record);
692 }
693
694 list_for_each_entry_safe (struct rgp_loader_events_record, record,
695 &loader_events->record, list) {
696 list_del(&record->list);
697 loader_events->record_count--;
698 free(record);
699 }
700
701 list_for_each_entry_safe (struct rgp_code_object_record, record,
702 &code_object->record, list) {
703 uint32_t mask = record->shader_stages_mask;
704 int i;
705
706 /* Free the copied shader code. */
707 while (mask) {
708 i = u_bit_scan(&mask);
709 free(record->shader_data[i].code);
710 }
711 list_del(&record->list);
712 free(record);
713 code_object->record_count--;
714 }
715
716 ac_sqtt_finish(sctx->sqtt);
717
718 hash_table_foreach (sctx->sqtt->pipeline_bos->table, entry) {
719 struct si_sqtt_fake_pipeline *pipeline =
720 (struct si_sqtt_fake_pipeline *)entry->data;
721 si_resource_reference(&pipeline->bo, NULL);
722 FREE(pipeline);
723 }
724
725 free(sctx->sqtt);
726 sctx->sqtt = NULL;
727
728 if (sctx->spm.bo)
729 si_spm_finish(sctx);
730 }
731
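/* Number of frames seen so far; compared against sqtt->start_frame to trigger
 * a capture. */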
732 static uint64_t num_frames = 0;
733
734 void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
735 {
736 /* Should we enable SQTT yet? */
737 if (!sctx->sqtt_enabled) {
738 bool frame_trigger = num_frames == sctx->sqtt->start_frame;
739 bool file_trigger = false;
740 if (sctx->sqtt->trigger_file &&
741 access(sctx->sqtt->trigger_file, W_OK) == 0) {
742 if (unlink(sctx->sqtt->trigger_file) == 0) {
743 file_trigger = true;
744 } else {
745 /* Do not enable tracing if we cannot remove the file,
746 * because otherwise a capture would be triggered on every frame.
747 */
748 fprintf(stderr, "radeonsi: could not remove thread "
749 "trace trigger file, ignoring\n");
750 }
751 }
752
753 if (frame_trigger || file_trigger) {
754 /* Wait for last submission */
755 sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence,
756 OS_TIMEOUT_INFINITE);
757
758 /* Start SQTT */
759 si_begin_sqtt(sctx, rcs);
760
761 sctx->sqtt_enabled = true;
762 sctx->sqtt->start_frame = -1;
763
764 /* Force shader update to make sure si_sqtt_describe_pipeline_bind is
765 * called for the current "pipeline".
766 */
767 sctx->do_update_shaders = true;
768 }
769 } else {
770 struct ac_sqtt_trace sqtt_trace = {0};
771
772 /* Stop SQTT */
773 si_end_sqtt(sctx, rcs);
774 sctx->sqtt_enabled = false;
775 sctx->sqtt->start_frame = -1;
776 assert(sctx->last_sqtt_fence);
777
778 /* Wait for SQTT to finish and read back the bo */
779 if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence,
780 OS_TIMEOUT_INFINITE) &&
781 si_get_sqtt_trace(sctx, &sqtt_trace)) {
782 struct ac_spm_trace spm_trace;
783
784 /* Map the SPM counter buffer */
785 if (sctx->spm.bo) {
786 sctx->spm.ptr = sctx->ws->buffer_map(
787 sctx->ws, sctx->spm.bo, NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);
788 ac_spm_get_trace(&sctx->spm, &spm_trace);
789 }
790
791 ac_dump_rgp_capture(&sctx->screen->info, &sqtt_trace,
792 sctx->spm.bo ? &spm_trace : NULL);
793
794 if (sctx->spm.ptr)
795 sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo);
796 } else {
797 fprintf(stderr, "Failed to read the trace\n");
798 }
799 }
800
801 num_frames++;
802 }
803
804 static void si_emit_sqtt_userdata(struct si_context *sctx,
805 struct radeon_cmdbuf *cs, const void *data,
806 uint32_t num_dwords)
807 {
808 const uint32_t *dwords = (uint32_t *)data;
809
810 radeon_begin(cs);
811
812 while (num_dwords > 0) {
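/* SQ_THREAD_TRACE_USERDATA_2 and _3 are back-to-back registers, so at most
 * 2 DWORDs can be written per packet. */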
813 uint32_t count = MIN2(num_dwords, 2);
814
815 radeon_set_uconfig_perfctr_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
816 radeon_emit_array(dwords, count);
817
818 dwords += count;
819 num_dwords -= count;
820 }
821 radeon_end();
822 }
823
824 static void
825 si_emit_spi_config_cntl(struct si_context *sctx,
826 struct radeon_cmdbuf *cs, bool enable)
827 {
828 radeon_begin(cs);
829
830 if (sctx->gfx_level >= GFX9) {
831 uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
832 S_031100_EXP_PRIORITY_ORDER(3) |
833 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
834 S_031100_ENABLE_SQG_BOP_EVENTS(enable);
835
836 if (sctx->gfx_level >= GFX10)
837 spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);
838
839 radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
840 } else {
841 /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
842 radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
843 S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
844 S_009100_ENABLE_SQG_BOP_EVENTS(enable));
845 }
846 radeon_end();
847 }
848
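/* Monotonically increasing event ID used as cmd_id in RGP event markers. */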
849 static uint32_t num_events = 0;
850 void si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
851 enum rgp_sqtt_marker_event_type api_type,
852 uint32_t vertex_offset_user_data,
853 uint32_t instance_offset_user_data,
854 uint32_t draw_index_user_data)
855 {
856 struct rgp_sqtt_marker_event marker = {0};
857
858 marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
859 marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
860 marker.cmd_id = num_events++;
861 marker.cb_id = 0;
862
863 if (vertex_offset_user_data == UINT_MAX ||
864 instance_offset_user_data == UINT_MAX) {
865 vertex_offset_user_data = 0;
866 instance_offset_user_data = 0;
867 }
868
869 if (draw_index_user_data == UINT_MAX)
870 draw_index_user_data = vertex_offset_user_data;
871
872 marker.vertex_offset_reg_idx = vertex_offset_user_data;
873 marker.instance_offset_reg_idx = instance_offset_user_data;
874 marker.draw_index_reg_idx = draw_index_user_data;
875
876 si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
877
878 sctx->sqtt_next_event = EventInvalid;
879 }
880
881 void si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
882 enum rgp_sqtt_marker_event_type api_type,
883 uint32_t x, uint32_t y, uint32_t z)
884 {
885 struct rgp_sqtt_marker_event_with_dims marker = {0};
886
887 marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
888 marker.event.api_type = api_type;
889 marker.event.cmd_id = num_events++;
890 marker.event.cb_id = 0;
891 marker.event.has_thread_dims = 1;
892
893 marker.thread_x = x;
894 marker.thread_y = y;
895 marker.thread_z = z;
896
897 si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
898 sctx->sqtt_next_event = EventInvalid;
899 }
900
901 void si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
902 {
903 struct rgp_sqtt_marker_barrier_start marker = {0};
904
905 marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
906 marker.cb_id = 0;
907 marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */
908
909 si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
910 }
911
912 void si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
913 unsigned flags)
914 {
915 struct rgp_sqtt_marker_barrier_end marker = {0};
916
917 marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
918 marker.cb_id = 0;
919
920 if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)
921 marker.vs_partial_flush = true;
922 if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)
923 marker.ps_partial_flush = true;
924 if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
925 marker.cs_partial_flush = true;
926
927 if (flags & SI_CONTEXT_PFP_SYNC_ME)
928 marker.pfp_sync_me = true;
929
930 if (flags & SI_CONTEXT_INV_VCACHE)
931 marker.inval_tcp = true;
932 if (flags & SI_CONTEXT_INV_ICACHE)
933 marker.inval_sqI = true;
934 if (flags & SI_CONTEXT_INV_SCACHE)
935 marker.inval_sqK = true;
936 if (flags & SI_CONTEXT_INV_L2)
937 marker.inval_tcc = true;
938
939 if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
940 marker.inval_cb = true;
941 marker.flush_cb = true;
942 }
943 if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
944 marker.inval_db = true;
945 marker.flush_db = true;
946 }
947
948 si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
949 }
950
951 void si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
952 enum rgp_sqtt_marker_user_event_type type,
953 const char *str, int len)
954 {
955 if (type == UserEventPop) {
956 assert(str == NULL);
957 struct rgp_sqtt_marker_user_event marker = {0};
958 marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
959 marker.data_type = type;
960
961 si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
962 } else {
963 assert(str != NULL);
964 struct rgp_sqtt_marker_user_event_with_length marker = {0};
965 marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
966 marker.user_event.data_type = type;
967 len = MIN2(1024, len);
968 marker.length = align(len, 4);
969
970 uint8_t *buffer = alloca(sizeof(marker) + marker.length);
971 memcpy(buffer, &marker, sizeof(marker));
972 memcpy(buffer + sizeof(marker), str, len);
973 buffer[sizeof(marker) + len - 1] = '\0';
974
975 si_emit_sqtt_userdata(sctx, rcs, buffer,
976 sizeof(marker) / 4 + marker.length / 4);
977 }
978 }
979
980 bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt,
981 uint64_t pipeline_hash)
982 {
983 simple_mtx_lock(&sqtt->rgp_pso_correlation.lock);
984 list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
985 &sqtt->rgp_pso_correlation.record, list) {
986 if (record->pipeline_hash[0] == pipeline_hash) {
987 simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
988 return true;
989 }
990 }
991 simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
992
993 return false;
994 }
995
996 static enum rgp_hardware_stages
997 si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key *key, enum pipe_shader_type stage)
998 {
999 switch (stage) {
1000 case PIPE_SHADER_VERTEX:
1001 if (key->ge.as_ls)
1002 return RGP_HW_STAGE_LS;
1003 else if (key->ge.as_es)
1004 return RGP_HW_STAGE_ES;
1005 else if (key->ge.as_ngg)
1006 return RGP_HW_STAGE_GS;
1007 else
1008 return RGP_HW_STAGE_VS;
1009 case PIPE_SHADER_TESS_CTRL:
1010 return RGP_HW_STAGE_HS;
1011 case PIPE_SHADER_TESS_EVAL:
1012 if (key->ge.as_es)
1013 return RGP_HW_STAGE_ES;
1014 else if (key->ge.as_ngg)
1015 return RGP_HW_STAGE_GS;
1016 else
1017 return RGP_HW_STAGE_VS;
1018 case PIPE_SHADER_GEOMETRY:
1019 return RGP_HW_STAGE_GS;
1020 case PIPE_SHADER_FRAGMENT:
1021 return RGP_HW_STAGE_PS;
1022 case PIPE_SHADER_COMPUTE:
1023 return RGP_HW_STAGE_CS;
1024 default:
1025 unreachable("invalid mesa shader stage");
1026 }
1027 }
1028
1029 static bool
1030 si_sqtt_add_code_object(struct si_context *sctx,
1031 struct si_sqtt_fake_pipeline *pipeline,
1032 bool is_compute)
1033 {
1034 struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
1035 struct rgp_code_object_record *record;
1036
1037 record = calloc(1, sizeof(struct rgp_code_object_record));
1038 if (!record)
1039 return false;
1040
1041 record->shader_stages_mask = 0;
1042 record->num_shaders_combined = 0;
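/* Reuse the 64-bit code hash for both halves of the 128-bit pipeline hash. */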
1043 record->pipeline_hash[0] = pipeline->code_hash;
1044 record->pipeline_hash[1] = pipeline->code_hash;
1045
1046 for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
1047 struct si_shader *shader;
1048 enum rgp_hardware_stages hw_stage;
1049
1050 if (is_compute) {
1051 if (i != PIPE_SHADER_COMPUTE)
1052 continue;
1053 shader = &sctx->cs_shader_state.program->shader;
1054 hw_stage = RGP_HW_STAGE_CS;
1055 } else if (i != PIPE_SHADER_COMPUTE) {
1056 if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
1057 continue;
1058 shader = sctx->shaders[i].current;
1059 hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
1060 } else {
1061 continue;
1062 }
1063
1064 uint8_t *code = malloc(shader->binary.uploaded_code_size);
1065 if (!code) {
1066 free(record);
1067 return false;
1068 }
1069 memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);
1070
1071 uint64_t va = pipeline->bo->gpu_address + pipeline->offset[i];
1072 unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
1073 record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
1074 record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
1075 record->shader_data[gl_shader_stage].code_size = shader->binary.uploaded_code_size;
1076 record->shader_data[gl_shader_stage].code = code;
1077 record->shader_data[gl_shader_stage].vgpr_count = shader->config.num_vgprs;
1078 record->shader_data[gl_shader_stage].sgpr_count = shader->config.num_sgprs;
1079 record->shader_data[gl_shader_stage].base_address = va & 0xffffffffffff;
1080 record->shader_data[gl_shader_stage].elf_symbol_offset = 0;
1081 record->shader_data[gl_shader_stage].hw_stage = hw_stage;
1082 record->shader_data[gl_shader_stage].is_combined = false;
1083 record->shader_data[gl_shader_stage].scratch_memory_size = shader->config.scratch_bytes_per_wave;
1084 record->shader_data[gl_shader_stage].wavefront_size = shader->wave_size;
1085
1086 record->shader_stages_mask |= 1 << gl_shader_stage;
1087 record->num_shaders_combined++;
1088 }
1089
1090 simple_mtx_lock(&code_object->lock);
1091 list_addtail(&record->list, &code_object->record);
1092 code_object->record_count++;
1093 simple_mtx_unlock(&code_object->lock);
1094
1095 return true;
1096 }
1097
1098 bool si_sqtt_register_pipeline(struct si_context *sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute)
1099 {
1100 assert(!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline->code_hash));
1101
1102 bool result = ac_sqtt_add_pso_correlation(sctx->sqtt, pipeline->code_hash, pipeline->code_hash);
1103 if (!result)
1104 return false;
1105
1106 result = ac_sqtt_add_code_object_loader_event(
1107 sctx->sqtt, pipeline->code_hash, pipeline->bo->gpu_address);
1108 if (!result)
1109 return false;
1110
1111 return si_sqtt_add_code_object(sctx, pipeline, is_compute);
1112 }
1113
1114 void si_sqtt_describe_pipeline_bind(struct si_context *sctx,
1115 uint64_t pipeline_hash,
1116 int bind_point)
1117 {
1118 struct rgp_sqtt_marker_pipeline_bind marker = {0};
1119 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
1120
1121 if (likely(!sctx->sqtt_enabled)) {
1122 return;
1123 }
1124
1125 marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
1126 marker.cb_id = 0;
1127 marker.bind_point = bind_point;
1128 marker.api_pso_hash[0] = pipeline_hash;
1129 marker.api_pso_hash[1] = pipeline_hash >> 32;
1130
1131 si_emit_sqtt_userdata(sctx, cs, &marker, sizeof(marker) / 4);
1132 }
1133