• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
#include <inttypes.h>

#include "amd_family.h"
#include "si_build_pm4.h"
#include "si_pipe.h"

#include "tgsi/tgsi_from_mesa.h"
#include "util/hash_table.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "ac_rgp.h"
#include "ac_sqtt.h"
17 
18 static void
19 si_emit_spi_config_cntl(struct si_context *sctx,
20                         struct radeon_cmdbuf *cs, bool enable);
21 
si_sqtt_init_bo(struct si_context * sctx)22 static bool si_sqtt_init_bo(struct si_context *sctx)
23 {
24    const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(&sctx->screen->info);
25    unsigned max_se = sctx->screen->info.max_se;
26    struct radeon_winsys *ws = sctx->ws;
27    uint64_t size;
28 
29    /* The buffer size and address need to be aligned in HW regs. Align the
30     * size as early as possible so that we do all the allocation & addressing
31     * correctly. */
32    sctx->sqtt->buffer_size =
33       align64(sctx->sqtt->buffer_size, 1ull << align_shift);
34 
35    /* Compute total size of the thread trace BO for all SEs. */
36    size = align64(sizeof(struct ac_sqtt_data_info) * max_se,
37                   1ull << align_shift);
38    size += sctx->sqtt->buffer_size * (uint64_t)max_se;
39 
40    sctx->sqtt->bo =
41       ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_GTT,
42                         RADEON_FLAG_NO_INTERPROCESS_SHARING |
43                            RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_SUBALLOC);
44    if (!sctx->sqtt->bo)
45       return false;
46 
47    sctx->sqtt->buffer_va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);
48 
49    return true;
50 }
51 
si_emit_sqtt_start(struct si_context * sctx,struct radeon_cmdbuf * cs,enum amd_ip_type ip_type)52 static void si_emit_sqtt_start(struct si_context *sctx,
53                                struct radeon_cmdbuf *cs,
54                                enum amd_ip_type ip_type)
55 {
56    struct si_screen *sscreen = sctx->screen;
57    const bool is_compute_queue = ip_type == AMD_IP_COMPUTE;
58    struct ac_pm4_state *pm4;
59 
60    pm4 = ac_pm4_create_sized(&sscreen->info, false, 512, is_compute_queue);
61    if (!pm4)
62       return;
63 
64    ac_sqtt_emit_start(&sscreen->info, pm4, sctx->sqtt, is_compute_queue);
65    ac_pm4_finalize(pm4);
66 
67    radeon_begin(cs);
68    radeon_emit_array(pm4->pm4, pm4->ndw);
69    radeon_end();
70 
71    ac_pm4_free_state(pm4);
72 }
73 
/* Emit the PM4 packets that stop SQTT into "cs", then the packets that wait
 * for the trace writeback to complete. On chips with the RB-harvest bug, a
 * full flush barrier is inserted between the two batches.
 */
static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs,
                              enum amd_ip_type ip_type)
{
   struct si_screen *sscreen = sctx->screen;
   const bool is_compute_queue = ip_type == AMD_IP_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&sscreen->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   /* First batch: tell the SQ to stop collecting thread trace data. */
   ac_sqtt_emit_stop(&sscreen->info, pm4, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   /* Reset the pm4 state so it can be reused for the wait packets below. */
   ac_pm4_clear_state(pm4, &sscreen->info, false, is_compute_queue);

   if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE
       * doesn't work. */
      sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB |
                             SI_BARRIER_SYNC_CS;
      sctx->emit_barrier(sctx, cs);
   }

   /* Second batch: wait for the thread trace to finish. */
   ac_sqtt_emit_wait(&sscreen->info, pm4, sctx->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin_again(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_free_state(pm4);
}
111 
/* Record everything needed to start a capture into "cs": queue preamble,
 * buffer residency, wait-for-idle, clockgating off, SQG events on, optional
 * SPM setup, then the SQTT start packets. The packet ordering here matters.
 */
static void si_sqtt_start(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
      case AMD_IP_GFX:
         /* This standalone CS needs CONTEXT_CONTROL with load/shadow update
          * bits set to start from a defined register state. */
         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
         break;
      case AMD_IP_COMPUTE:
         /* Compute queues get a NOP instead of CONTEXT_CONTROL. */
         radeon_emit(PKT3(PKT3_NOP, 0, 0));
         radeon_emit(0);
         break;
      default:
        /* Unsupported. */
        assert(false);
   }
   radeon_end();

   /* Make the SQTT (and optional SPM) buffers resident for this submission. */
   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);
   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                          SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM |
                          SI_BARRIER_INV_VMEM | SI_BARRIER_INV_L2 |
                          SI_BARRIER_PFP_SYNC_ME;
   sctx->emit_barrier(sctx, cs);

   /* Disable clock gating while tracing. */
   si_inhibit_clockgating(sctx, cs, true);

   /* Enable SQG events that collects thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   /* Set up SPM performance counters before starting the trace. */
   if (sctx->spm.bo) {
      si_pc_emit_spm_reset(cs);
      si_pc_emit_shaders(cs, ac_sqtt_get_shader_mask(&sctx->screen->info));
      si_emit_spm_setup(sctx, cs);
   }

   si_emit_sqtt_start(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_start(cs);
}
166 
/* Record everything needed to stop a capture into "cs": queue preamble,
 * buffer residency, SPM stop, wait-for-idle, SQTT stop packets, and finally
 * restore the SQG-event and clockgating state changed by si_sqtt_start().
 */
static void si_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
      case AMD_IP_GFX:
         /* This standalone CS needs CONTEXT_CONTROL with load/shadow update
          * bits set to start from a defined register state. */
         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
         break;
      case AMD_IP_COMPUTE:
         /* Compute queues get a NOP instead of CONTEXT_CONTROL. */
         radeon_emit(PKT3(PKT3_NOP, 0, 0));
         radeon_emit(0);
         break;
      default:
        /* Unsupported. */
        assert(false);
   }
   radeon_end();

   /* Make the SQTT (and optional SPM) buffers resident for this submission. */
   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   if (sctx->spm.bo)
      si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
                          sctx->screen->info.never_send_perfcounter_stop);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                          SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM |
                          SI_BARRIER_INV_VMEM | SI_BARRIER_INV_L2 |
                          SI_BARRIER_PFP_SYNC_ME;
   sctx->emit_barrier(sctx, cs);

   si_emit_sqtt_stop(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}
220 
si_sqtt_init_cs(struct si_context * sctx)221 static void si_sqtt_init_cs(struct si_context *sctx)
222 {
223    struct radeon_winsys *ws = sctx->ws;
224 
225    for (unsigned i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
226       sctx->sqtt->start_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
227       if (!ws->cs_create(sctx->sqtt->start_cs[i], sctx->ctx, (enum amd_ip_type)i,
228                          NULL, NULL)) {
229          free(sctx->sqtt->start_cs[i]);
230          sctx->sqtt->start_cs[i] = NULL;
231          return;
232       }
233       si_sqtt_start(sctx, sctx->sqtt->start_cs[i]);
234 
235       sctx->sqtt->stop_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
236       if (!ws->cs_create(sctx->sqtt->stop_cs[i], sctx->ctx, (enum amd_ip_type)i,
237                          NULL, NULL)) {
238          ws->cs_destroy(sctx->sqtt->start_cs[i]);
239          free(sctx->sqtt->start_cs[i]);
240          sctx->sqtt->start_cs[i] = NULL;
241          free(sctx->sqtt->stop_cs[i]);
242          sctx->sqtt->stop_cs[i] = NULL;
243          return;
244       }
245 
246       si_sqtt_stop(sctx, sctx->sqtt->stop_cs[i]);
247    }
248 }
249 
si_begin_sqtt(struct si_context * sctx,struct radeon_cmdbuf * rcs)250 static void si_begin_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
251 {
252    struct radeon_cmdbuf *cs = sctx->sqtt->start_cs[sctx->ws->cs_get_ip_type(rcs)];
253    sctx->ws->cs_flush(cs, 0, NULL);
254 }
255 
si_end_sqtt(struct si_context * sctx,struct radeon_cmdbuf * rcs)256 static void si_end_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
257 {
258    struct radeon_cmdbuf *cs = sctx->sqtt->stop_cs[sctx->ws->cs_get_ip_type(rcs)];
259    sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
260 }
261 
262 static bool
si_sqtt_resize_bo(struct si_context * sctx)263 si_sqtt_resize_bo(struct si_context *sctx)
264 {
265    /* Destroy the previous thread trace BO. */
266    struct pb_buffer_lean *bo = sctx->sqtt->bo;
267    radeon_bo_reference(sctx->screen->ws, &bo, NULL);
268 
269    /* Double the size of the thread trace buffer per SE. */
270    sctx->sqtt->buffer_size *= 2;
271 
272    fprintf(stderr,
273            "Failed to get the thread trace because the buffer "
274            "was too small, resizing to %d KB\n",
275            sctx->sqtt->buffer_size / 1024);
276 
277    /* Re-create the thread trace BO. */
278    return si_sqtt_init_bo(sctx);
279 }
280 
si_get_sqtt_trace(struct si_context * sctx,struct ac_sqtt_trace * sqtt)281 static bool si_get_sqtt_trace(struct si_context *sctx,
282                               struct ac_sqtt_trace *sqtt)
283 {
284    memset(sqtt, 0, sizeof(*sqtt));
285 
286    sctx->sqtt->ptr =
287       sctx->ws->buffer_map(sctx->ws, sctx->sqtt->bo, NULL, PIPE_MAP_READ);
288 
289    if (!sctx->sqtt->ptr)
290       return false;
291 
292    if (!ac_sqtt_get_trace(sctx->sqtt, &sctx->screen->info, sqtt)) {
293       if (!si_sqtt_resize_bo(sctx)) {
294          fprintf(stderr, "radeonsi: Failed to resize the SQTT buffer.\n");
295       } else {
296          for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
297             sctx->screen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
298             sctx->screen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
299          }
300          si_sqtt_init_cs(sctx);
301       }
302       return false;
303    }
304    return true;
305 }
306 
si_init_sqtt(struct si_context * sctx)307 bool si_init_sqtt(struct si_context *sctx)
308 {
309    static bool warn_once = true;
310    if (warn_once) {
311       fprintf(stderr, "*************************************************\n");
312       fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
313       fprintf(stderr, "*************************************************\n");
314       warn_once = false;
315    }
316 
317    sctx->sqtt = CALLOC_STRUCT(ac_sqtt);
318 
319    if (sctx->gfx_level < GFX8) {
320       fprintf(stderr, "GPU hardware not supported: refer to "
321                       "the RGP documentation for the list of "
322                       "supported GPUs!\n");
323       return false;
324    }
325 
326    if (sctx->gfx_level > GFX11) {
327       fprintf(stderr, "radeonsi: Thread trace is not supported "
328                       "for that GPU!\n");
329       return false;
330    }
331 
332    /* Default buffer size set to 32MB per SE. */
333    sctx->sqtt->buffer_size =
334       debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
335    sctx->sqtt->instruction_timing_enabled =
336       debug_get_bool_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING", true);
337    sctx->sqtt->start_frame = 10;
338 
339    const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
340    if (trigger) {
341       sctx->sqtt->start_frame = atoi(trigger);
342       if (sctx->sqtt->start_frame <= 0) {
343          /* This isn't a frame number, must be a file */
344          sctx->sqtt->trigger_file = strdup(trigger);
345          sctx->sqtt->start_frame = -1;
346       }
347    }
348 
349    if (!si_sqtt_init_bo(sctx))
350       return false;
351 
352    sctx->sqtt->pipeline_bos = _mesa_hash_table_u64_create(NULL);
353 
354    ac_sqtt_init(sctx->sqtt);
355 
356    if (sctx->gfx_level >= GFX10 &&
357        debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) {
358       /* Limit SPM counters to GFX10 and GFX10_3 for now */
359       ASSERTED bool r = si_spm_init(sctx);
360       assert(r);
361    }
362 
363    si_sqtt_init_cs(sctx);
364 
365    sctx->sqtt_next_event = EventInvalid;
366 
367    return true;
368 }
369 
si_destroy_sqtt(struct si_context * sctx)370 void si_destroy_sqtt(struct si_context *sctx)
371 {
372    struct si_screen *sscreen = sctx->screen;
373    struct pb_buffer_lean *bo = sctx->sqtt->bo;
374    radeon_bo_reference(sctx->screen->ws, &bo, NULL);
375 
376    if (sctx->sqtt->trigger_file)
377       free(sctx->sqtt->trigger_file);
378 
379    for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
380       sscreen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
381       sscreen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
382    }
383 
384    struct rgp_pso_correlation *pso_correlation =
385       &sctx->sqtt->rgp_pso_correlation;
386    struct rgp_loader_events *loader_events = &sctx->sqtt->rgp_loader_events;
387    struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
388    list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
389                              &pso_correlation->record, list) {
390       list_del(&record->list);
391       pso_correlation->record_count--;
392       free(record);
393    }
394 
395    list_for_each_entry_safe (struct rgp_loader_events_record, record,
396                              &loader_events->record, list) {
397       list_del(&record->list);
398       loader_events->record_count--;
399       free(record);
400    }
401 
402    list_for_each_entry_safe (struct rgp_code_object_record, record,
403                              &code_object->record, list) {
404       uint32_t mask = record->shader_stages_mask;
405       int i;
406 
407       /* Free the disassembly. */
408       while (mask) {
409          i = u_bit_scan(&mask);
410          free(record->shader_data[i].code);
411       }
412       list_del(&record->list);
413       free(record);
414       code_object->record_count--;
415    }
416 
417    ac_sqtt_finish(sctx->sqtt);
418 
419    hash_table_foreach (sctx->sqtt->pipeline_bos->table, entry) {
420       struct si_sqtt_fake_pipeline *pipeline =
421          (struct si_sqtt_fake_pipeline *)entry->data;
422       si_resource_reference(&pipeline->bo, NULL);
423       FREE(pipeline);
424    }
425 
426    free(sctx->sqtt);
427    sctx->sqtt = NULL;
428 
429    if (sctx->spm.bo)
430       si_spm_finish(sctx);
431 }
432 
/* Frames seen so far; compared against sqtt->start_frame to decide when to
 * start capturing. */
static uint64_t num_frames = 0;

/* Per-frame SQTT driver, called once per frame with the current command
 * buffer: starts a capture when the frame-number or trigger-file condition
 * fires, otherwise (if a capture is in flight) stops it, waits for the GPU,
 * and dumps the RGP capture.
 */
void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->sqtt_enabled) {
      bool frame_trigger = num_frames == sctx->sqtt->start_frame;
      bool file_trigger = false;
      if (sctx->sqtt->trigger_file &&
          access(sctx->sqtt->trigger_file, W_OK) == 0) {
         if (unlink(sctx->sqtt->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread "
                            "trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence,
                              OS_TIMEOUT_INFINITE);

         /* Start SQTT */
         si_begin_sqtt(sctx, rcs);

         sctx->sqtt_enabled = true;
         sctx->sqtt->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is
          * called for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_sqtt_trace sqtt_trace = {0};

      /* Stop SQTT */
      si_end_sqtt(sctx, rcs);
      sctx->sqtt_enabled = false;
      sctx->sqtt->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence,
                               OS_TIMEOUT_INFINITE) &&
          si_get_sqtt_trace(sctx, &sqtt_trace)) {
         struct ac_spm_trace spm_trace;

         /* Map the SPM counter buffer */
         if (sctx->spm.bo) {
            sctx->spm.ptr = sctx->ws->buffer_map(
               sctx->ws, sctx->spm.bo, NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);
            ac_spm_get_trace(&sctx->spm, &spm_trace);
         }

         /* Dump the capture (SQTT plus optional SPM counter data). */
         ac_dump_rgp_capture(&sctx->screen->info, &sqtt_trace,
                             sctx->spm.bo ? &spm_trace : NULL);

         if (sctx->spm.ptr)
            sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo);
      } else {
         /* Capture failed (e.g. buffer overflow): retry in 10 frames unless
          * a trigger file is used. */
         fprintf(stderr, "Failed to read the trace\n");
         if (!sctx->sqtt->trigger_file) {
            sctx->sqtt->start_frame = num_frames + 10;
         }
      }
   }

   num_frames++;
}
507 
/* Record "num_dwords" dwords of RGP marker data into the thread trace stream
 * by writing them to the SQ_THREAD_TRACE_USERDATA registers, starting at
 * USERDATA_2, at most two dwords per packet.
 */
static void si_emit_sqtt_userdata(struct si_context *sctx,
                                  struct radeon_cmdbuf *cs, const void *data,
                                  uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      /* Two consecutive USERDATA registers per packet at most. */
      uint32_t count = MIN2(num_dwords, 2);

      radeon_set_uconfig_perfctr_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}
527 
/* Toggle the SQG top/bottom-of-pipe events that feed the thread trace.
 * On GFX9+ SPI_CONFIG_CNTL is a uconfig register; on GFX6-GFX8 it is
 * protected and must go through the privileged-register packet.
 */
static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}
552 
/* Monotonic id assigned to each event marker emitted below. */
static uint32_t num_events = 0;

/* Emit an RGP "event" marker for the next draw: which API call it was and
 * which user-data SGPR indices hold the vertex/instance/draw-index values
 * (UINT_MAX meaning "not present").
 */
void si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t vertex_offset_user_data,
                                uint32_t instance_offset_user_data,
                                uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   /* No event was explicitly queued: report a plain draw. */
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   /* Vertex/instance offsets are only meaningful as a pair. */
   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   /* The queued event has been consumed. */
   sctx->sqtt_next_event = EventInvalid;
}
584 
si_write_event_with_dims_marker(struct si_context * sctx,struct radeon_cmdbuf * rcs,enum rgp_sqtt_marker_event_type api_type,uint32_t x,uint32_t y,uint32_t z)585 void si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
586                                      enum rgp_sqtt_marker_event_type api_type,
587                                      uint32_t x, uint32_t y, uint32_t z)
588 {
589    struct rgp_sqtt_marker_event_with_dims marker = {0};
590 
591    marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
592    marker.event.api_type = api_type;
593    marker.event.cmd_id = num_events++;
594    marker.event.cb_id = 0;
595    marker.event.has_thread_dims = 1;
596 
597    marker.thread_x = x;
598    marker.thread_y = y;
599    marker.thread_z = z;
600 
601    si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
602    sctx->sqtt_next_event = EventInvalid;
603 }
604 
si_sqtt_describe_barrier_start(struct si_context * sctx,struct radeon_cmdbuf * rcs)605 void si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
606 {
607    struct rgp_sqtt_marker_barrier_start marker = {0};
608 
609    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
610    marker.cb_id = 0;
611    marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */
612 
613    si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
614 }
615 
si_sqtt_describe_barrier_end(struct si_context * sctx,struct radeon_cmdbuf * rcs,unsigned flags)616 void si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
617                                   unsigned flags)
618 {
619    struct rgp_sqtt_marker_barrier_end marker = {0};
620 
621    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
622    marker.cb_id = 0;
623 
624    if (flags & SI_BARRIER_SYNC_VS)
625       marker.vs_partial_flush = true;
626    if (flags & SI_BARRIER_SYNC_PS)
627       marker.ps_partial_flush = true;
628    if (flags & SI_BARRIER_SYNC_CS)
629       marker.cs_partial_flush = true;
630 
631    if (flags & SI_BARRIER_PFP_SYNC_ME)
632       marker.pfp_sync_me = true;
633 
634    if (flags & SI_BARRIER_INV_VMEM)
635       marker.inval_tcp = true;
636    if (flags & SI_BARRIER_INV_ICACHE)
637       marker.inval_sqI = true;
638    if (flags & SI_BARRIER_INV_SMEM)
639       marker.inval_sqK = true;
640    if (flags & SI_BARRIER_INV_L2)
641       marker.inval_tcc = true;
642 
643    if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
644       marker.inval_cb = true;
645       marker.flush_cb = true;
646    }
647    if (flags & SI_BARRIER_SYNC_AND_INV_DB) {
648       marker.inval_db = true;
649       marker.flush_db = true;
650    }
651 
652    si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
653 }
654 
si_write_user_event(struct si_context * sctx,struct radeon_cmdbuf * rcs,enum rgp_sqtt_marker_user_event_type type,const char * str,int len)655 void si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
656                          enum rgp_sqtt_marker_user_event_type type,
657                          const char *str, int len)
658 {
659    if (type == UserEventPop) {
660       assert(str == NULL);
661       struct rgp_sqtt_marker_user_event marker = {0};
662       marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
663       marker.data_type = type;
664 
665       si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
666    } else {
667       assert(str != NULL);
668       struct rgp_sqtt_marker_user_event_with_length marker = {0};
669       marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
670       marker.user_event.data_type = type;
671       len = MIN2(1024, len);
672       marker.length = align(len, 4);
673 
674       uint8_t *buffer = alloca(sizeof(marker) + marker.length);
675       memcpy(buffer, &marker, sizeof(marker));
676       memcpy(buffer + sizeof(marker), str, len);
677       buffer[sizeof(marker) + len - 1] = '\0';
678 
679       si_emit_sqtt_userdata(sctx, rcs, buffer,
680                             sizeof(marker) / 4 + marker.length / 4);
681    }
682 }
683 
si_sqtt_pipeline_is_registered(struct ac_sqtt * sqtt,uint64_t pipeline_hash)684 bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt,
685                                     uint64_t pipeline_hash)
686 {
687    simple_mtx_lock(&sqtt->rgp_pso_correlation.lock);
688    list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
689                              &sqtt->rgp_pso_correlation.record, list) {
690       if (record->pipeline_hash[0] == pipeline_hash) {
691          simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
692          return true;
693       }
694    }
695    simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
696 
697    return false;
698 }
699 
700 static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key * key,enum pipe_shader_type stage)701 si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key *key, enum pipe_shader_type stage)
702 {
703    switch (stage) {
704       case PIPE_SHADER_VERTEX:
705          if (key->ge.as_ls)
706             return RGP_HW_STAGE_LS;
707          else if (key->ge.as_es)
708             return RGP_HW_STAGE_ES;
709          else if (key->ge.as_ngg)
710             return RGP_HW_STAGE_GS;
711          else
712             return RGP_HW_STAGE_VS;
713       case PIPE_SHADER_TESS_CTRL:
714          return RGP_HW_STAGE_HS;
715       case PIPE_SHADER_TESS_EVAL:
716          if (key->ge.as_es)
717             return RGP_HW_STAGE_ES;
718          else if (key->ge.as_ngg)
719             return RGP_HW_STAGE_GS;
720          else
721             return RGP_HW_STAGE_VS;
722       case PIPE_SHADER_GEOMETRY:
723          return RGP_HW_STAGE_GS;
724       case PIPE_SHADER_FRAGMENT:
725          return RGP_HW_STAGE_PS;
726       case PIPE_SHADER_COMPUTE:
727          return RGP_HW_STAGE_CS;
728       default:
729          unreachable("invalid mesa shader stage");
730    }
731 }
732 
733 static bool
si_sqtt_add_code_object(struct si_context * sctx,struct si_sqtt_fake_pipeline * pipeline,uint32_t * gfx_sh_offsets)734 si_sqtt_add_code_object(struct si_context *sctx,
735                         struct si_sqtt_fake_pipeline *pipeline,
736                         uint32_t *gfx_sh_offsets)
737 {
738    struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
739    struct rgp_code_object_record *record;
740    bool is_compute = gfx_sh_offsets == NULL;
741 
742    record = calloc(1, sizeof(struct rgp_code_object_record));
743    if (!record)
744       return false;
745 
746    record->shader_stages_mask = 0;
747    record->num_shaders_combined = 0;
748    record->pipeline_hash[0] = pipeline->code_hash;
749    record->pipeline_hash[1] = pipeline->code_hash;
750 
751    for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
752       struct si_shader *shader;
753       enum rgp_hardware_stages hw_stage;
754 
755       if (is_compute) {
756          if (i != PIPE_SHADER_COMPUTE)
757             continue;
758          shader = &sctx->cs_shader_state.program->shader;
759          hw_stage = RGP_HW_STAGE_CS;
760       } else if (i <= PIPE_SHADER_FRAGMENT) {
761          if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
762             continue;
763          shader = sctx->shaders[i].current;
764          hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
765       } else {
766          continue;
767       }
768 
769       uint8_t *code = malloc(shader->binary.uploaded_code_size);
770       if (!code) {
771          free(record);
772          return false;
773       }
774       memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);
775 
776       uint64_t va = pipeline->bo->gpu_address + (is_compute ? 0 : gfx_sh_offsets[i]);
777       unsigned lds_increment = sctx->gfx_level >= GFX11 && i == MESA_SHADER_FRAGMENT ?
778          1024 : sctx->screen->info.lds_encode_granularity;
779 
780       memset(record->shader_data[i].rt_shader_name, 0, sizeof(record->shader_data[i].rt_shader_name));
781       record->shader_data[i].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
782       record->shader_data[i].hash[1] = record->shader_data[i].hash[0];
783       record->shader_data[i].code_size = shader->binary.uploaded_code_size;
784       record->shader_data[i].code = code;
785       record->shader_data[i].vgpr_count = shader->config.num_vgprs;
786       record->shader_data[i].sgpr_count = shader->config.num_sgprs;
787       record->shader_data[i].base_address = va & 0xffffffffffff;
788       record->shader_data[i].elf_symbol_offset = 0;
789       record->shader_data[i].hw_stage = hw_stage;
790       record->shader_data[i].is_combined = false;
791       record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
792       record->shader_data[i].lds_size = shader->config.lds_size * lds_increment;
793       record->shader_data[i].wavefront_size = shader->wave_size;
794 
795       record->shader_stages_mask |= 1 << i;
796       record->num_shaders_combined++;
797    }
798 
799    simple_mtx_lock(&code_object->lock);
800    list_addtail(&record->list, &code_object->record);
801    code_object->record_count++;
802    simple_mtx_unlock(&code_object->lock);
803 
804    return true;
805 }
806 
si_sqtt_register_pipeline(struct si_context * sctx,struct si_sqtt_fake_pipeline * pipeline,uint32_t * gfx_sh_offsets)807 bool si_sqtt_register_pipeline(struct si_context *sctx, struct si_sqtt_fake_pipeline *pipeline,
808                                uint32_t *gfx_sh_offsets)
809 {
810    assert(!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline->code_hash));
811 
812    bool result = ac_sqtt_add_pso_correlation(sctx->sqtt, pipeline->code_hash, pipeline->code_hash);
813    if (!result)
814       return false;
815 
816    result = ac_sqtt_add_code_object_loader_event(
817       sctx->sqtt, pipeline->code_hash, pipeline->bo->gpu_address);
818    if (!result)
819       return false;
820 
821    return si_sqtt_add_code_object(sctx, pipeline, gfx_sh_offsets);
822 }
823 
si_sqtt_describe_pipeline_bind(struct si_context * sctx,uint64_t pipeline_hash,int bind_point)824 void si_sqtt_describe_pipeline_bind(struct si_context *sctx,
825                                     uint64_t pipeline_hash,
826                                     int bind_point)
827 {
828    struct rgp_sqtt_marker_pipeline_bind marker = {0};
829    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
830 
831    if (likely(!sctx->sqtt_enabled)) {
832       return;
833    }
834 
835    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
836    marker.cb_id = 0;
837    marker.bind_point = bind_point;
838    marker.api_pso_hash[0] = pipeline_hash;
839    marker.api_pso_hash[1] = pipeline_hash >> 32;
840 
841    si_emit_sqtt_userdata(sctx, cs, &marker, sizeof(marker) / 4);
842 }
843