/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "amd_family.h"
#include "si_build_pm4.h"
#include "si_pipe.h"

#include "tgsi/tgsi_from_mesa.h"
#include "util/hash_table.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "ac_rgp.h"
#include "ac_sqtt.h"

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable);

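/* Allocate the VRAM buffer that receives SQ thread trace (SQTT) output.
 * Layout: one ac_sqtt_data_info struct per shader engine (aligned as a
 * block), followed by one data buffer of sqtt->buffer_size bytes per SE. */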
static bool si_sqtt_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->sqtt->buffer_size =
      align64(sctx->sqtt->buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_sqtt_data_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->sqtt->buffer_size * (uint64_t)max_se;

   sctx->sqtt->pipeline_bos = _mesa_hash_table_u64_create(NULL);

   sctx->sqtt->bo =
      ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_VRAM,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                           RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->sqtt->bo)
      return false;

   return true;
}

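/* Program the per-SE SQTT registers (buffer address/size, wave/token masks,
 * control) and start the trace. Register offsets and start mechanisms differ
 * per gfx level: compute queues toggle COMPUTE_THREAD_TRACE_ENABLE, graphics
 * queues emit a THREAD_TRACE_START event. */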
static void si_emit_sqtt_start(struct si_context *sctx,
                               struct radeon_cmdbuf *cs,
                               enum amd_ip_type ip_type)
{
   struct si_screen *sscreen = sctx->screen;
   uint32_t shifted_size = sctx->sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   unsigned max_se = sscreen->info.max_se;

   radeon_begin(cs);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);
      uint64_t data_va =
         ac_sqtt_get_data_va(&sctx->screen->info, sctx->sqtt, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;

      if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_perfctr_reg_seq(R_030800_GRBM_GFX_INDEX, 1);
      radeon_emit(S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) |
                  S_030800_INSTANCE_BROADCAST_WRITES(1));

      /* Select the first active CUs. */
      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      if (sctx->gfx_level >= GFX10) {
         uint32_t token_mask =
            V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
            V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_CONTEXT |
            V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONFIG;
         int wgp = first_active_cu / 2;
         unsigned shader_mask = 0x7f; /* all shader stages */

         /* Order seems important for the following 2 registers. */
         if (sctx->gfx_level >= GFX11) {
            /* Disable unsupported hw shader stages. */
            shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);

            radeon_set_uconfig_perfctr_reg_seq(R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, 2);
            radeon_emit(shifted_va);
            radeon_emit(S_0367A4_SIZE(shifted_size) |
                        S_0367A4_BASE_HI(shifted_va >> 32));

            radeon_set_uconfig_perfctr_reg_seq(R_0367B4_SQ_THREAD_TRACE_MASK, 2);
            radeon_emit(S_0367B4_WTYPE_INCLUDE(shader_mask) |
                        S_0367B4_SA_SEL(0) | S_0367B4_WGP_SEL(wgp) |
                        S_0367B4_SIMD_SEL(0));
            radeon_emit(S_0367B8_REG_INCLUDE(token_mask) |
                        S_0367B8_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
         } else {
            radeon_set_privileged_config_reg(
               R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
               S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

            radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE,
                                             shifted_va);

            radeon_set_privileged_config_reg(
               R_008D14_SQ_THREAD_TRACE_MASK,
               S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
               S_008D14_WGP_SEL(wgp) | S_008D14_SIMD_SEL(0));

            radeon_set_privileged_config_reg(
               R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
               S_008D18_REG_INCLUDE(token_mask) |
               S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
         }

         /* Should be emitted last (it enables thread traces). */
         uint32_t ctrl = S_008D1C_MODE(1) | S_008D1C_HIWATER(5) |
                         S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) /* 4096 clk */ |
                         S_008D1C_DRAW_EVENT_EN(1);

         if (sctx->gfx_level == GFX10_3)
            ctrl |= S_008D1C_LOWATER_OFFSET(4);

         ctrl |= S_008D1C_AUTO_FLUSH_MODE(
            sctx->screen->info.has_sqtt_auto_flush_mode_bug);

         switch (sctx->gfx_level) {
            case GFX10:
            case GFX10_3:
               ctrl |= S_008D1C_REG_STALL_EN(1) |
                       S_008D1C_SPI_STALL_EN(1) |
                       S_008D1C_SQ_STALL_EN(1) |
                       S_008D1C_REG_DROP_ON_STALL(0);
               radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, ctrl);
               break;
            case GFX11:
               ctrl |= S_0367B0_SPI_STALL_EN(1) |
                       S_0367B0_SQ_STALL_EN(1) |
                       S_0367B0_REG_AT_HWM(2);
               radeon_set_uconfig_perfctr_reg_seq(R_0367B0_SQ_THREAD_TRACE_CTRL, 1);
               radeon_emit(ctrl);
               break;
            default:
               assert(false);
         }
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
                                S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
                                S_030CD4_RESET_BUFFER(1));

         uint32_t sqtt_mask = S_030CC8_CU_SEL(first_active_cu) |
                              S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
                              S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) |
                              S_030CC8_SPI_STALL_EN(1) | S_030CC8_SQ_STALL_EN(1);

         radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) |
                                S_030CCC_REG_MASK(0xff) |
                                S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) |
                                S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
                                S_030CEC_HIWATER(4));

         if (sctx->gfx_level == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
                                   S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t sqtt_mode = S_030CD8_MASK_PS(1) |
                              S_030CD8_MASK_VS(1) |
                              S_030CD8_MASK_GS(1) |
                              S_030CD8_MASK_ES(1) |
                              S_030CD8_MASK_HS(1) |
                              S_030CD8_MASK_LS(1) |
                              S_030CD8_MASK_CS(1) |
                              S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
                              S_030CD8_MODE(1);

         if (sctx->gfx_level == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            sqtt_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (ip_type == AMD_IP_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
   radeon_end();
}

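/* Status registers describing the trace each SE produced (write pointer,
 * status, dropped-token counter). Offsets depend on the gfx level; their
 * values are copied back into the per-SE info struct after stopping. */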
static const uint32_t gfx9_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_sqtt_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static const uint32_t gfx11_sqtt_info_regs[] = {
   R_0367BC_SQ_THREAD_TRACE_WPTR,
   R_0367D0_SQ_THREAD_TRACE_STATUS,
   R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static void si_copy_sqtt_info_regs(struct si_context *sctx,
                                   struct radeon_cmdbuf *cs,
                                   unsigned se_index)
{
   const uint32_t *sqtt_info_regs = NULL;

   switch (sctx->gfx_level) {
      case GFX10_3:
      case GFX10:
         sqtt_info_regs = gfx10_sqtt_info_regs;
         break;
      case GFX11:
         sqtt_info_regs = gfx11_sqtt_info_regs;
         break;
      case GFX9:
         sqtt_info_regs = gfx9_sqtt_info_regs;
         break;
      default:
         unreachable("Unsupported gfx_level");
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);
   uint64_t info_va = ac_sqtt_get_info_va(va, se_index);

   radeon_begin(cs);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
                  COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
      radeon_emit(sqtt_info_regs[i] >> 2);
      radeon_emit(0); /* unused */
      radeon_emit((info_va + i * 4));
      radeon_emit((info_va + i * 4) >> 32);
   }

   if (sctx->gfx_level == GFX11) {
      /* On GFX11, WPTR is incremented from the offset of the current buffer base
       * address and it needs to be subtracted to get the correct offset:
       *
       * 1) get the current buffer base address for this SE
       * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
       * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
       */
      uint64_t data_va =
         ac_sqtt_get_data_va(&sctx->screen->info, sctx->sqtt, va, se_index);
      uint64_t shifted_data_va = (data_va >> 5);
      uint64_t init_wptr_value = shifted_data_va & 0x1fffffff;

      radeon_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0));
      radeon_emit(ATOMIC_OP(TC_OP_ATOMIC_SUB_32));
      radeon_emit(info_va);
      radeon_emit(info_va >> 32);
      radeon_emit(init_wptr_value);
      radeon_emit(init_wptr_value >> 32);
      radeon_emit(0);
      radeon_emit(0);
      radeon_emit(0);
   }

   radeon_end();
}

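/* Stop the trace and drain it: emit the stop event (or clear
 * COMPUTE_THREAD_TRACE_ENABLE on compute queues), wait on FINISH_DONE where
 * it works, disable the trace mode, wait until the SQTT block is idle, then
 * copy the per-SE status registers back to memory. */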
static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs,
                              enum amd_ip_type ip_type)
{
   unsigned max_se = sctx->screen->info.max_se;
   radeon_begin(cs);

   /* Stop the thread trace with a different event based on the queue. */
   if (ip_type == AMD_IP_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
   radeon_end();

   if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE
       * doesn't work. */
      sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
                     SI_CONTEXT_CS_PARTIAL_FLUSH;
      sctx->emit_cache_flush(sctx, cs);
   }

   for (unsigned se = 0; se < max_se; se++) {
      if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
         continue;

      radeon_begin(cs);

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (sctx->gfx_level >= GFX10) {
         uint32_t tt_status_reg = sctx->gfx_level >= GFX11 ?
                                     R_0367D0_SQ_THREAD_TRACE_STATUS :
                                     R_008D20_SQ_THREAD_TRACE_STATUS;
         if (!sctx->screen->info.has_sqtt_rb_harvest_bug) {
            /* Make sure to wait for the trace buffer. */
            radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
            radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs
                                                    from the reference value */
            radeon_emit(tt_status_reg >> 2);     /* register */
            radeon_emit(0);
            radeon_emit(0);                      /* reference value */
            radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_FINISH_DONE :
                                                   ~C_008D20_FINISH_DONE); /* mask */
            radeon_emit(4);                      /* poll interval */
         }

         /* Disable the thread trace mode. */
         if (sctx->gfx_level >= GFX11) {
            radeon_set_uconfig_perfctr_reg_seq(R_0367B0_SQ_THREAD_TRACE_CTRL, 1);
            radeon_emit(S_008D1C_MODE(0));
         } else {
            radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                             S_008D1C_MODE(0));
         }

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL);     /* wait until the register is equal
                                                 to the reference value */
         radeon_emit(tt_status_reg >> 2);     /* register */
         radeon_emit(0);
         radeon_emit(0);                      /* reference value */
         radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_BUSY :
                                                ~C_008D20_BUSY); /* mask */
         radeon_emit(4);                      /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL);                   /* wait until the register is equal to
                                                               the reference value */
         radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0);                                    /* reference value */
         radeon_emit(~C_030CE8_BUSY);                       /* mask */
         radeon_emit(4);                                    /* poll interval */
      }
      radeon_end();

      si_copy_sqtt_info_regs(sctx, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_begin_again(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));
   radeon_end();
}

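/* Record everything needed to begin a capture into 'cs': reference the SQTT
 * (and optional SPM) buffers, wait for idle, inhibit clock gating, enable
 * SQG events, then program and start the trace. */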
static void si_sqtt_start(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
      case AMD_IP_GFX:
         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
         break;
      case AMD_IP_COMPUTE:
         radeon_emit(PKT3(PKT3_NOP, 0, 0));
         radeon_emit(0);
         break;
      default:
         /* Unsupported. */
         assert(false);
   }
   radeon_end();

   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);
   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                  SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE |
                  SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_L2 |
                  SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   if (sctx->spm.bo) {
      si_pc_emit_spm_reset(cs);
      si_pc_emit_shaders(cs, 0x7f);
      si_emit_spm_setup(sctx, cs);
   }

   si_emit_sqtt_start(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_start(cs);
}

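/* Mirror image of si_sqtt_start(): stop SPM counters if enabled, wait for
 * idle, stop the trace, then restore the SQG event and clock-gating state. */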
static void si_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
      case AMD_IP_GFX:
         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
         break;
      case AMD_IP_COMPUTE:
         radeon_emit(PKT3(PKT3_NOP, 0, 0));
         radeon_emit(0);
         break;
      default:
         /* Unsupported. */
         assert(false);
   }
   radeon_end();

   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   if (sctx->spm.bo)
      si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
                          sctx->screen->info.never_send_perfcounter_stop);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                  SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE |
                  SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_L2 |
                  SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_emit_sqtt_stop(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}

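/* Pre-record one start and one stop command buffer per IP type, so that
 * beginning/ending a capture only requires submitting them. */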
static void si_sqtt_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   for (unsigned i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
      sctx->sqtt->start_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
      if (!ws->cs_create(sctx->sqtt->start_cs[i], sctx->ctx, (enum amd_ip_type)i,
                         NULL, NULL)) {
         free(sctx->sqtt->start_cs[i]);
         sctx->sqtt->start_cs[i] = NULL;
         return;
      }
      si_sqtt_start(sctx, sctx->sqtt->start_cs[i]);

      sctx->sqtt->stop_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
      if (!ws->cs_create(sctx->sqtt->stop_cs[i], sctx->ctx, (enum amd_ip_type)i,
                         NULL, NULL)) {
         ws->cs_destroy(sctx->sqtt->start_cs[i]);
         free(sctx->sqtt->start_cs[i]);
         sctx->sqtt->start_cs[i] = NULL;
         free(sctx->sqtt->stop_cs[i]);
         sctx->sqtt->stop_cs[i] = NULL;
         return;
      }

      si_sqtt_stop(sctx, sctx->sqtt->stop_cs[i]);
   }
}

static void si_begin_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->sqtt->start_cs[sctx->ws->cs_get_ip_type(rcs)];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void si_end_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->sqtt->stop_cs[sctx->ws->cs_get_ip_type(rcs)];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}

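/* Map the SQTT buffer and parse it into 'sqtt'. On parse failure, inspect
 * the per-SE info structs to report whether the buffer was too small. */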
static bool si_get_sqtt_trace(struct si_context *sctx,
                              struct ac_sqtt_trace *sqtt)
{
   unsigned max_se = sctx->screen->info.max_se;

   memset(sqtt, 0, sizeof(*sqtt));

   sctx->sqtt->ptr =
      sctx->ws->buffer_map(sctx->ws, sctx->sqtt->bo, NULL, PIPE_MAP_READ);

   if (!sctx->sqtt->ptr)
      return false;

   if (!ac_sqtt_get_trace(sctx->sqtt, &sctx->screen->info, sqtt)) {
      void *sqtt_ptr = sctx->sqtt->ptr;

      for (unsigned se = 0; se < max_se; se++) {
         uint64_t info_offset = ac_sqtt_get_info_offset(se);
         void *info_ptr = sqtt_ptr + info_offset;
         struct ac_sqtt_data_info *info = (struct ac_sqtt_data_info *)info_ptr;

         if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
            continue;

         if (!ac_is_sqtt_complete(&sctx->screen->info, sctx->sqtt, info)) {
            uint32_t expected_size =
               ac_get_expected_buffer_size(&sctx->screen->info, info);
            uint32_t available_size = (info->cur_offset * 32) / 1024;

            fprintf(stderr,
                    "Failed to get the thread trace "
                    "because the buffer is too small. The "
                    "hardware needs %d KB but the "
                    "buffer size is %d KB.\n",
                    expected_size, available_size);
            fprintf(stderr, "Please update the buffer size with "
                            "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
            return false;
         }
      }
   }

   return true;
}

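/* One-time initialization: check hardware support, read the
 * AMD_THREAD_TRACE_BUFFER_SIZE/TRIGGER/SPM environment variables, allocate
 * the trace buffer, optionally set up SPM counters, and pre-record the
 * start/stop command buffers. */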
bool si_init_sqtt(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->sqtt = CALLOC_STRUCT(ac_sqtt);

   if (sctx->gfx_level < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
                      "the RGP documentation for the list of "
                      "supported GPUs!\n");
      return false;
   }

   if (sctx->gfx_level > GFX11) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
                      "for this GPU!\n");
      return false;
   }

   /* Default buffer size set to 32MB per SE. */
   sctx->sqtt->buffer_size =
      debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
   sctx->sqtt->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->sqtt->start_frame = atoi(trigger);
      if (sctx->sqtt->start_frame <= 0) {
         /* This isn't a frame number, so it must be a trigger file. */
         sctx->sqtt->trigger_file = strdup(trigger);
         sctx->sqtt->start_frame = -1;
      }
   }

   if (!si_sqtt_init_bo(sctx))
      return false;

   ac_sqtt_init(sctx->sqtt);

   if (sctx->gfx_level >= GFX10 &&
       debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) {
      /* Limit SPM counters to GFX10 and GFX10_3 for now. */
      ASSERTED bool r = si_spm_init(sctx);
      assert(r);
   }

   si_sqtt_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}

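/* Tear-down: release the trace BO and command buffers, free all RGP records
 * (PSO correlation, loader events, code objects) and the fake pipeline BOs,
 * then free the SQTT state itself. */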
void si_destroy_sqtt(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer_lean *bo = sctx->sqtt->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->sqtt->trigger_file)
      free(sctx->sqtt->trigger_file);

   for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
      sscreen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
      sscreen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
   }

   struct rgp_pso_correlation *pso_correlation =
      &sctx->sqtt->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->sqtt->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
   list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
                             &pso_correlation->record, list) {
      list_del(&record->list);
      pso_correlation->record_count--;
      free(record);
   }

   list_for_each_entry_safe (struct rgp_loader_events_record, record,
                             &loader_events->record, list) {
      list_del(&record->list);
      loader_events->record_count--;
      free(record);
   }

   list_for_each_entry_safe (struct rgp_code_object_record, record,
                             &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
      code_object->record_count--;
   }

   ac_sqtt_finish(sctx->sqtt);

   hash_table_foreach (sctx->sqtt->pipeline_bos->table, entry) {
      struct si_sqtt_fake_pipeline *pipeline =
         (struct si_sqtt_fake_pipeline *)entry->data;
      si_resource_reference(&pipeline->bo, NULL);
      FREE(pipeline);
   }

   free(sctx->sqtt);
   sctx->sqtt = NULL;

   if (sctx->spm.bo)
      si_spm_finish(sctx);
}

static uint64_t num_frames = 0;

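/* Called once per frame. Starts a capture when the frame-count or
 * trigger-file condition fires; on the following call it stops the trace,
 * waits for completion and writes out the RGP capture. */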
void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->sqtt_enabled) {
      bool frame_trigger = num_frames == sctx->sqtt->start_frame;
      bool file_trigger = false;
      if (sctx->sqtt->trigger_file &&
          access(sctx->sqtt->trigger_file, W_OK) == 0) {
         if (unlink(sctx->sqtt->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread "
                            "trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence,
                              OS_TIMEOUT_INFINITE);

         /* Start SQTT */
         si_begin_sqtt(sctx, rcs);

         sctx->sqtt_enabled = true;
         sctx->sqtt->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is
          * called for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_sqtt_trace sqtt_trace = {0};

      /* Stop SQTT */
      si_end_sqtt(sctx, rcs);
      sctx->sqtt_enabled = false;
      sctx->sqtt->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence,
                               OS_TIMEOUT_INFINITE) &&
          si_get_sqtt_trace(sctx, &sqtt_trace)) {
         struct ac_spm_trace spm_trace;

         /* Map the SPM counter buffer */
         if (sctx->spm.bo) {
            sctx->spm.ptr = sctx->ws->buffer_map(
               sctx->ws, sctx->spm.bo, NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);
            ac_spm_get_trace(&sctx->spm, &spm_trace);
         }

         ac_dump_rgp_capture(&sctx->screen->info, &sqtt_trace,
                             sctx->spm.bo ? &spm_trace : NULL);

         if (sctx->spm.ptr)
            sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
      }
   }

   num_frames++;
}

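/* Instrumentation markers are written through the SQ_THREAD_TRACE_USERDATA
 * registers so they end up in the trace stream. Only USERDATA_2/3 are
 * written here, hence data is emitted two DWORDs at a time. */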
static void si_emit_sqtt_userdata(struct si_context *sctx,
                                  struct radeon_cmdbuf *cs, const void *data,
                                  uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_set_uconfig_perfctr_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

static uint32_t num_events = 0;

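/* Emit an RGP "event" marker describing the next draw, including the
 * user-data SGPR indices holding the vertex offset, instance offset and
 * draw index (UINT_MAX meaning "not used"). */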
void si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t vertex_offset_user_data,
                                uint32_t instance_offset_user_data,
                                uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

void si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                     enum rgp_sqtt_marker_event_type api_type,
                                     uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

void si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                  unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)
      marker.vs_partial_flush = true;
   if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)
      marker.ps_partial_flush = true;
   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
      marker.cs_partial_flush = true;

   if (flags & SI_CONTEXT_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_CONTEXT_INV_VCACHE)
      marker.inval_tcp = true;
   if (flags & SI_CONTEXT_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_CONTEXT_INV_SCACHE)
      marker.inval_sqK = true;
   if (flags & SI_CONTEXT_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                         enum rgp_sqtt_marker_user_event_type type,
                         const char *str, int len)
{
   if (type == UserEventPop) {
      assert(str == NULL);
      struct rgp_sqtt_marker_user_event marker = {0};
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert(str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = {0};
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_sqtt_userdata(sctx, rcs, buffer,
                            sizeof(marker) / 4 + marker.length / 4);
   }
}

bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt,
                                    uint64_t pipeline_hash)
{
   simple_mtx_lock(&sqtt->rgp_pso_correlation.lock);
   list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
                             &sqtt->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);

   return false;
}

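/* Map a gallium shader stage to the RGP hardware stage it runs on, which
 * depends on how the shader was compiled (LS/ES merging, NGG). */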
static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key *key, enum pipe_shader_type stage)
{
   switch (stage) {
      case PIPE_SHADER_VERTEX:
         if (key->ge.as_ls)
            return RGP_HW_STAGE_LS;
         else if (key->ge.as_es)
            return RGP_HW_STAGE_ES;
         else if (key->ge.as_ngg)
            return RGP_HW_STAGE_GS;
         else
            return RGP_HW_STAGE_VS;
      case PIPE_SHADER_TESS_CTRL:
         return RGP_HW_STAGE_HS;
      case PIPE_SHADER_TESS_EVAL:
         if (key->ge.as_es)
            return RGP_HW_STAGE_ES;
         else if (key->ge.as_ngg)
            return RGP_HW_STAGE_GS;
         else
            return RGP_HW_STAGE_VS;
      case PIPE_SHADER_GEOMETRY:
         return RGP_HW_STAGE_GS;
      case PIPE_SHADER_FRAGMENT:
         return RGP_HW_STAGE_PS;
      case PIPE_SHADER_COMPUTE:
         return RGP_HW_STAGE_CS;
      default:
         unreachable("invalid mesa shader stage");
   }
}

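/* Build an RGP code-object record for the currently bound shaders: copy
 * each shader binary and record its hash, GPU address, register usage and
 * hardware stage, then append the record to the code object list. */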
static bool
si_sqtt_add_code_object(struct si_context *sctx,
                        struct si_sqtt_fake_pipeline *pipeline,
                        bool is_compute)
{
   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
   struct rgp_code_object_record *record;

   record = calloc(1, sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline->code_hash;
   record->pipeline_hash[1] = pipeline->code_hash;

   for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i != PIPE_SHADER_COMPUTE) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = pipeline->bo->gpu_address + pipeline->offset[i];
      unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
      record->shader_data[gl_shader_stage].hash[0] =
         _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[gl_shader_stage].hash[1] =
         record->shader_data[gl_shader_stage].hash[0];
      record->shader_data[gl_shader_stage].code_size = shader->binary.uploaded_code_size;
      record->shader_data[gl_shader_stage].code = code;
      record->shader_data[gl_shader_stage].vgpr_count = shader->config.num_vgprs;
      record->shader_data[gl_shader_stage].sgpr_count = shader->config.num_sgprs;
      record->shader_data[gl_shader_stage].base_address = va & 0xffffffffffff;
      record->shader_data[gl_shader_stage].elf_symbol_offset = 0;
      record->shader_data[gl_shader_stage].hw_stage = hw_stage;
      record->shader_data[gl_shader_stage].is_combined = false;
      record->shader_data[gl_shader_stage].scratch_memory_size =
         shader->config.scratch_bytes_per_wave;
      record->shader_data[gl_shader_stage].wavefront_size = shader->wave_size;

      record->shader_stages_mask |= 1 << gl_shader_stage;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

bool si_sqtt_register_pipeline(struct si_context *sctx,
                               struct si_sqtt_fake_pipeline *pipeline,
                               bool is_compute)
{
   assert(!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline->code_hash));

   bool result = ac_sqtt_add_pso_correlation(sctx->sqtt, pipeline->code_hash,
                                             pipeline->code_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(
      sctx->sqtt, pipeline->code_hash, pipeline->bo->gpu_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline, is_compute);
}

void si_sqtt_describe_pipeline_bind(struct si_context *sctx,
                                    uint64_t pipeline_hash,
                                    int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->sqtt_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_sqtt_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}