/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */


#include "si_pipe.h"
#include "si_build_pm4.h"
#include "si_compute.h"

#include "ac_rgp.h"
#include "ac_sqtt.h"
#include "util/u_memory.h"
#include "tgsi/tgsi_from_mesa.h"

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable);

static bool
si_thread_trace_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,
                                             1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_thread_trace_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->thread_trace->buffer_size * (uint64_t)max_se;
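   /* Resulting BO layout (see ac_thread_trace_get_info_va/_data_va): an
    * aligned array of ac_thread_trace_info structs, one per SE, followed by
    * one data buffer per SE:
    *   [info SE0 ... info SEn-1][data SE0][data SE1]...[data SEn-1] */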

   sctx->thread_trace->bo =
      ws->buffer_create(ws, size, 4096,
                        RADEON_DOMAIN_VRAM,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                        RADEON_FLAG_GTT_WC |
                        RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->thread_trace->bo)
      return false;

   return true;
}

static bool
si_se_is_disabled(struct si_context *sctx, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return sctx->screen->info.cu_mask[se][0] == 0;
}


static void
si_emit_thread_trace_start(struct si_context *sctx,
                           struct radeon_cmdbuf *cs,
                           uint32_t queue_family_index)
{
   struct si_screen *sscreen = sctx->screen;
   uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   unsigned max_se = sscreen->info.max_se;

   radeon_begin(cs);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
      uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;

      if (si_se_is_disabled(sctx, se))
         continue;

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      /* Select the first active CU. */
      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      if (sctx->gfx_level >= GFX10) {
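         /* On GFX10+ the SQ_THREAD_TRACE registers live in privileged config
          * space, hence radeon_set_privileged_config_reg instead of
          * radeon_set_uconfig_reg. */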
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                                          S_008D04_SIZE(shifted_size) |
                                          S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

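         /* On GFX10+ CUs are paired into WGPs; trace the first active WGP
          * (hence the divide by 2) and only SIMD0 within it. */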
         int wgp = first_active_cu / 2;
         radeon_set_privileged_config_reg(R_008D14_SQ_THREAD_TRACE_MASK,
                                          S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
                                          S_008D14_SA_SEL(0) |
                                          S_008D14_WGP_SEL(wgp) |
                                          S_008D14_SIMD_SEL(0));

         radeon_set_privileged_config_reg(R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                      S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC |
                                           V_008D18_REG_INCLUDE_SHDEC |
                                           V_008D18_REG_INCLUDE_GFXUDEC |
                                           V_008D18_REG_INCLUDE_CONTEXT |
                                           V_008D18_REG_INCLUDE_COMP |
                                           V_008D18_REG_INCLUDE_CONFIG) |
                      S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(1) |
                                          S_008D1C_HIWATER(5) |
                                          S_008D1C_UTIL_TIMER(1) |
                                          S_008D1C_RT_FREQ(2) | /* 4096 clk */
                                          S_008D1C_DRAW_EVENT_EN(1) |
                                          S_008D1C_REG_STALL_EN(1) |
                                          S_008D1C_SPI_STALL_EN(1) |
                                          S_008D1C_SQ_STALL_EN(1) |
                                          S_008D1C_REG_DROP_ON_STALL(0) |
                                          S_008D1C_LOWATER_OFFSET(
                                             sctx->gfx_level >= GFX10_3 ? 4 : 0) |
                                          S_008D1C_AUTO_FLUSH_MODE(sctx->screen->info.has_sqtt_auto_flush_mode_bug));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
                                S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
                                S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) |
                                      S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) |
                                      S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) |
                                      S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK,
                                thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) |
                                S_030CCC_REG_MASK(0xff) |
                                S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) |
                                S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
                                S_030CEC_HIWATER(4));

         if (sctx->gfx_level == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
                                   S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) |
            S_030CD8_MASK_VS(1) |
            S_030CD8_MASK_GS(1) |
            S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) |
            S_030CD8_MASK_LS(1) |
            S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (sctx->gfx_level == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (queue_family_index == AMD_IP_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
   radeon_end();
}

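/* Per-SE status registers read back after a capture;
 * si_copy_thread_trace_info_regs() below copies them, in order, into each
 * SE's ac_thread_trace_info slot in the trace BO. */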
static const uint32_t gfx9_thread_trace_info_regs[] =
{
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] =
{
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static void
si_copy_thread_trace_info_regs(struct si_context *sctx,
                               struct radeon_cmdbuf *cs,
                               unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   switch (sctx->gfx_level) {
   case GFX10_3:
   case GFX10:
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
      break;
   case GFX9:
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
      break;
   default:
      unreachable("Unsupported gfx_level");
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   radeon_begin(cs);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
                  COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                  COPY_DATA_WR_CONFIRM);
      radeon_emit(thread_trace_info_regs[i] >> 2);
      radeon_emit(0); /* unused */
      radeon_emit((info_va + i * 4));
      radeon_emit((info_va + i * 4) >> 32);
   }
   radeon_end();
}



static void
si_emit_thread_trace_stop(struct si_context *sctx,
                          struct radeon_cmdbuf *cs,
                          uint32_t queue_family_index)
{
   unsigned max_se = sctx->screen->info.max_se;

   radeon_begin(cs);

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == AMD_IP_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
   radeon_end();

   if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE doesn't work. */
      sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
                     SI_CONTEXT_FLUSH_AND_INV_DB |
                     SI_CONTEXT_CS_PARTIAL_FLUSH;
      sctx->emit_cache_flush(sctx, cs);
   }

   for (unsigned se = 0; se < max_se; se++) {
      if (si_se_is_disabled(sctx, se))
         continue;

      radeon_begin(cs);

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (sctx->gfx_level >= GFX10) {
         if (!sctx->screen->info.has_sqtt_rb_harvest_bug) {
            /* Make sure to wait for the trace buffer. */
            radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
            radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
            radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2);  /* register */
            radeon_emit(0);
            radeon_emit(0); /* reference value */
            radeon_emit(~C_008D20_FINISH_DONE); /* mask */
            radeon_emit(4); /* poll interval */
         }

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2);  /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(~C_008D20_BUSY); /* mask */
         radeon_emit(4); /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2);  /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(~C_030CE8_BUSY); /* mask */
         radeon_emit(4); /* poll interval */
      }
      radeon_end();

      si_copy_thread_trace_info_regs(sctx, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_begin_again(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));
   radeon_end();
}

static void
si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

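   /* These CSes are flushed standalone (see si_begin/si_end_thread_trace), so
    * presumably each queue wants its usual IB preamble: a CONTEXT_CONTROL
    * packet on gfx, a NOP on compute. */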
   switch (family) {
      case AMD_IP_GFX:
         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
         break;
      case AMD_IP_COMPUTE:
         radeon_emit(PKT3(PKT3_NOP, 0, 0));
         radeon_emit(0);
         break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);
   ws->cs_add_buffer(cs,
                     sctx->spm_trace.bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable the SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   si_pc_emit_spm_reset(cs);

   si_pc_emit_shaders(cs, 0x7f);

   si_emit_spm_setup(sctx, cs);

   si_emit_thread_trace_start(sctx, cs, family);

   si_pc_emit_spm_start(cs);
}

static void
si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
      case AMD_IP_GFX:
         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
         break;
      case AMD_IP_COMPUTE:
         radeon_emit(PKT3(PKT3_NOP, 0, 0));
         radeon_emit(0);
         break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   ws->cs_add_buffer(cs,
                     sctx->spm_trace.bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
                       sctx->screen->info.never_send_perfcounter_stop);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_emit_thread_trace_stop(sctx, cs, family);

   si_pc_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}


static void
si_thread_trace_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   /* Thread trace start CS (only handles AMD_IP_GFX). */
   sctx->thread_trace->start_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->start_cs[AMD_IP_GFX],
                      sctx->ctx, AMD_IP_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[AMD_IP_GFX]);
      sctx->thread_trace->start_cs[AMD_IP_GFX] = NULL;
      return;
   }

   si_thread_trace_start(sctx, AMD_IP_GFX, sctx->thread_trace->start_cs[AMD_IP_GFX]);

   /* Thread trace stop CS. */
   sctx->thread_trace->stop_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->stop_cs[AMD_IP_GFX],
                      sctx->ctx, AMD_IP_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[AMD_IP_GFX]);
      sctx->thread_trace->start_cs[AMD_IP_GFX] = NULL;
      free(sctx->thread_trace->stop_cs[AMD_IP_GFX]);
      sctx->thread_trace->stop_cs[AMD_IP_GFX] = NULL;
      return;
   }

   si_thread_trace_stop(sctx, AMD_IP_GFX, sctx->thread_trace->stop_cs[AMD_IP_GFX]);
}

static void
si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[AMD_IP_GFX];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void
si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[AMD_IP_GFX];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}

static bool
si_get_thread_trace(struct si_context *sctx,
                    struct ac_thread_trace *thread_trace)
{
   unsigned max_se = sctx->screen->info.max_se;

   memset(thread_trace, 0, sizeof(*thread_trace));
   thread_trace->num_traces = max_se;

   sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo,
                                                  NULL, PIPE_MAP_READ);

   if (!sctx->thread_trace->ptr)
      return false;

   void *thread_trace_ptr = sctx->thread_trace->ptr;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(&sctx->screen->info, sctx->thread_trace, se);
      void *info_ptr = thread_trace_ptr + info_offset;
      void *data_ptr = thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info =
         (struct ac_thread_trace_info *)info_ptr;

      struct ac_thread_trace_se thread_trace_se = {0};

      if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) {
         uint32_t expected_size =
            ac_get_expected_buffer_size(&sctx->screen->info, info);
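         /* cur_offset counts 32-byte units; convert to KB for the message. */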
         uint32_t available_size = (info->cur_offset * 32) / 1024;

         fprintf(stderr, "Failed to get the thread trace "
                 "because the buffer is too small. The "
                 "hardware needs %d KB but the "
                 "buffer size is %d KB.\n",
                 expected_size, available_size);
         fprintf(stderr, "Please update the buffer size with "
                 "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      /* For GFX10+, compute_unit really means WGP. */
      thread_trace_se.compute_unit =
         sctx->screen->info.gfx_level >= GFX10 ? (first_active_cu / 2) : first_active_cu;

      thread_trace->traces[se] = thread_trace_se;
   }

   thread_trace->data = sctx->thread_trace;
   return true;
}


bool
si_init_thread_trace(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data);

   if (sctx->gfx_level < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
              "the RGP documentation for the list of "
              "supported GPUs!\n");
      return false;
   }

   if (sctx->gfx_level > GFX10_3) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
              "on this GPU!\n");
      return false;
   }

   /* The default buffer size is 32 MB per SE. */
   sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
   sctx->thread_trace->start_frame = 10;

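   /* AMD_THREAD_TRACE_TRIGGER accepts either a frame number (e.g.
    * AMD_THREAD_TRACE_TRIGGER=10 captures the 10th frame) or a file path: in
    * that case, a capture starts on the first frame where the file exists
    * and can be removed (see si_handle_thread_trace). */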
   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->thread_trace->start_frame = atoi(trigger);
      if (sctx->thread_trace->start_frame <= 0) {
         /* This isn't a frame number, so it must be a trigger file. */
         sctx->thread_trace->trigger_file = strdup(trigger);
         sctx->thread_trace->start_frame = -1;
      }
   }

   if (!si_thread_trace_init_bo(sctx))
      return false;

   list_inithead(&sctx->thread_trace->rgp_pso_correlation.record);
   simple_mtx_init(&sctx->thread_trace->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_loader_events.record);
   simple_mtx_init(&sctx->thread_trace->rgp_loader_events.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_code_object.record);
   simple_mtx_init(&sctx->thread_trace->rgp_code_object.lock, mtx_plain);

   if (sctx->gfx_level >= GFX10) {
      /* Limit SPM counters to GFX10+ for now. */
      ASSERTED bool r = si_spm_init(sctx);
      assert(r);
   }

   si_thread_trace_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}

void
si_destroy_thread_trace(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer *bo = sctx->thread_trace->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->thread_trace->trigger_file)
      free(sctx->thread_trace->trigger_file);

   sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[AMD_IP_GFX]);
   sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[AMD_IP_GFX]);

   struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object;
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &pso_correlation->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_pso_correlation.lock);

   list_for_each_entry_safe(struct rgp_loader_events_record, record,
                            &loader_events->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_loader_events.lock);

   list_for_each_entry_safe(struct rgp_code_object_record, record,
                            &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);

   free(sctx->thread_trace);
   sctx->thread_trace = NULL;

   if (sctx->gfx_level >= GFX10)
      si_spm_finish(sctx);
}

static uint64_t num_frames = 0;

void
si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->thread_trace_enabled) {
      bool frame_trigger = num_frames == sctx->thread_trace->start_frame;
      bool file_trigger = false;
      if (sctx->thread_trace->trigger_file &&
          access(sctx->thread_trace->trigger_file, W_OK) == 0) {
         if (unlink(sctx->thread_trace->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for the last submission. */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE);

         /* Start SQTT. */
         si_begin_thread_trace(sctx, rcs);

         sctx->thread_trace_enabled = true;
         sctx->thread_trace->start_frame = -1;

         /* Force a shader update to make sure si_sqtt_describe_pipeline_bind
          * is called for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_thread_trace thread_trace = {0};

      /* Stop SQTT. */
      si_end_thread_trace(sctx, rcs);
      sctx->thread_trace_enabled = false;
      sctx->thread_trace->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the BO. */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) &&
          si_get_thread_trace(sctx, &thread_trace)) {
         /* Map the SPM counter buffer. */
         if (sctx->gfx_level >= GFX10)
            sctx->spm_trace.ptr = sctx->ws->buffer_map(sctx->ws, sctx->spm_trace.bo,
                                                       NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);

         ac_dump_rgp_capture(&sctx->screen->info, &thread_trace, &sctx->spm_trace);

         if (sctx->spm_trace.ptr)
            sctx->ws->buffer_unmap(sctx->ws, sctx->spm_trace.bo);
      } else {
         fprintf(stderr, "Failed to read the thread trace\n");
      }
   }

   num_frames++;
}


static void
si_emit_thread_trace_userdata(struct si_context *sctx,
                              struct radeon_cmdbuf *cs,
                              const void *data, uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);
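      /* SQ_THREAD_TRACE_USERDATA_2 and _3 are back-to-back registers, so at
       * most two dwords can be written per packet. */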

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->gfx_level >= GFX10);

      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}

static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

static uint32_t num_events = 0;
void
si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                           enum rgp_sqtt_marker_event_type api_type,
                           uint32_t vertex_offset_user_data,
                           uint32_t instance_offset_user_data,
                           uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

void
si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

void
si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                             unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)
      marker.vs_partial_flush = true;
   if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)
      marker.ps_partial_flush = true;
   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
      marker.cs_partial_flush = true;

   if (flags & SI_CONTEXT_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_CONTEXT_INV_VCACHE)
      marker.inval_tcp = true;
   if (flags & SI_CONTEXT_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_CONTEXT_INV_SCACHE)
      marker.inval_sqK = true;
   if (flags & SI_CONTEXT_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                    enum rgp_sqtt_marker_user_event_type type,
                    const char *str, int len)
{
   if (type == UserEventPop) {
      assert(str == NULL);
      struct rgp_sqtt_marker_user_event marker = { 0 };
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert(str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = { 0 };
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
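      /* Make sure the string is NUL-terminated (this may clobber the last
       * byte if str is exactly len bytes long). */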
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_thread_trace_userdata(sctx, rcs, buffer, sizeof(marker) / 4 + marker.length / 4);
   }
}


bool
si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
                               uint64_t pipeline_hash)
{
   simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock);
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &thread_trace_data->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);

   return false;
}



static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key *key, enum pipe_shader_type stage)
{
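   /* Map the API shader stage to the HW stage it runs as: a VS may run as
    * HW LS (before tessellation), HW ES (before a GS), part of an NGG GS,
    * or a plain HW VS; a TES similarly runs as ES, NGG GS or VS. */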
   switch (stage) {
   case PIPE_SHADER_VERTEX:
      if (key->ge.as_ls)
         return RGP_HW_STAGE_LS;
      else if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case PIPE_SHADER_TESS_EVAL:
      if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case PIPE_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case PIPE_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}

static bool
si_sqtt_add_code_object(struct si_context *sctx,
                        uint64_t pipeline_hash,
                        bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
   struct rgp_code_object_record *record;

   record = malloc(sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline_hash;
   record->pipeline_hash[1] = pipeline_hash;

   for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i != PIPE_SHADER_COMPUTE) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = shader->bo->gpu_address;
      unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
      record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
      record->shader_data[gl_shader_stage].code_size = shader->binary.uploaded_code_size;
      record->shader_data[gl_shader_stage].code = code;
      record->shader_data[gl_shader_stage].vgpr_count = shader->config.num_vgprs;
      record->shader_data[gl_shader_stage].sgpr_count = shader->config.num_sgprs;
      record->shader_data[gl_shader_stage].base_address = va & 0xffffffffffff;
      record->shader_data[gl_shader_stage].elf_symbol_offset = 0;
      record->shader_data[gl_shader_stage].hw_stage = hw_stage;
      record->shader_data[gl_shader_stage].is_combined = false;
      record->shader_data[gl_shader_stage].scratch_memory_size = shader->config.scratch_bytes_per_wave;
      record->shader_data[gl_shader_stage].wavefront_size = shader->wave_size;

      record->shader_stages_mask |= 1 << gl_shader_stage;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

bool
si_sqtt_register_pipeline(struct si_context *sctx, uint64_t pipeline_hash,
                          uint64_t base_address, bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;

   assert(!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));

   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
}

void
si_sqtt_describe_pipeline_bind(struct si_context *sctx,
                               uint64_t pipeline_hash,
                               int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->thread_trace_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}