/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "amd_family.h"
#include "si_build_pm4.h"
#include "si_pipe.h"

#include "tgsi/tgsi_from_mesa.h"
#include "util/hash_table.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "ac_rgp.h"
#include "ac_sqtt.h"
static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable);

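/* Allocate the thread trace BO. It holds one ac_sqtt_data_info header per
 * shader engine, followed by one per-SE trace buffer of sqtt->buffer_size
 * bytes; both the size and the address must respect the HW alignment. */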
static bool si_sqtt_init_bo(struct si_context *sctx)
{
   const uint32_t align_shift = ac_sqtt_get_buffer_align_shift(&sctx->screen->info);
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->sqtt->buffer_size =
      align64(sctx->sqtt->buffer_size, 1ull << align_shift);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_sqtt_data_info) * max_se,
                  1ull << align_shift);
   size += sctx->sqtt->buffer_size * (uint64_t)max_se;

   sctx->sqtt->bo =
      ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_GTT,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                           RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->sqtt->bo)
      return false;

   sctx->sqtt->buffer_va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);

   return true;
}

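/* Build the PM4 packets that start thread tracing (via ac_sqtt_emit_start)
 * and copy them into the given command buffer. */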
static void si_emit_sqtt_start(struct si_context *sctx,
                               struct radeon_cmdbuf *cs,
                               enum amd_ip_type ip_type)
{
   struct si_screen *sscreen = sctx->screen;
   const bool is_compute_queue = ip_type == AMD_IP_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&sscreen->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_start(&sscreen->info, pm4, sctx->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_free_state(pm4);
}

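/* Build and emit the PM4 packets that stop thread tracing, then wait for the
 * trace to be written back. Chips with harvested RBs need an explicit idle
 * barrier in between because FINISH_DONE is unreliable there. */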
static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs,
                              enum amd_ip_type ip_type)
{
   struct si_screen *sscreen = sctx->screen;
   const bool is_compute_queue = ip_type == AMD_IP_COMPUTE;
   struct ac_pm4_state *pm4;

   pm4 = ac_pm4_create_sized(&sscreen->info, false, 512, is_compute_queue);
   if (!pm4)
      return;

   ac_sqtt_emit_stop(&sscreen->info, pm4, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_clear_state(pm4, &sscreen->info, false, is_compute_queue);

   if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
      /* Some chips with disabled RBs should wait for idle because FINISH_DONE
       * doesn't work. */
      sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_SYNC_AND_INV_DB |
                             SI_BARRIER_SYNC_CS;
      sctx->emit_barrier(sctx, cs);
   }

   ac_sqtt_emit_wait(&sscreen->info, pm4, sctx->sqtt, is_compute_queue);
   ac_pm4_finalize(pm4);

   radeon_begin_again(cs);
   radeon_emit_array(pm4->pm4, pm4->ndw);
   radeon_end();

   ac_pm4_free_state(pm4);
}

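/* Record the full start sequence for one queue: add the SQTT/SPM buffers,
 * wait for idle, disable clockgating, enable SQG events and, if SPM is
 * enabled, program and start the performance counters before SQTT. */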
static void si_sqtt_start(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
   case AMD_IP_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case AMD_IP_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   default:
      /* Unsupported. */
      assert(false);
   }
   radeon_end();

   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);
   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                          SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM |
                          SI_BARRIER_INV_VMEM | SI_BARRIER_INV_L2 |
                          SI_BARRIER_PFP_SYNC_ME;
   sctx->emit_barrier(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   if (sctx->spm.bo) {
      si_pc_emit_spm_reset(cs);
      si_pc_emit_shaders(cs, ac_sqtt_get_shader_mask(&sctx->screen->info));
      si_emit_spm_setup(sctx, cs);
   }

   si_emit_sqtt_start(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_start(cs);
}

static void si_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;
   enum amd_ip_type ip_type = sctx->ws->cs_get_ip_type(cs);

   radeon_begin(cs);

   switch (ip_type) {
   case AMD_IP_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case AMD_IP_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   default:
      /* Unsupported. */
      assert(false);
   }
   radeon_end();

   ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM);

   if (sctx->spm.bo)
      ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
                        RADEON_DOMAIN_VRAM);

   si_cp_dma_wait_for_idle(sctx, cs);

   if (sctx->spm.bo)
      si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
                          sctx->screen->info.never_send_perfcounter_stop);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS |
                          SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_SMEM |
                          SI_BARRIER_INV_VMEM | SI_BARRIER_INV_L2 |
                          SI_BARRIER_PFP_SYNC_ME;
   sctx->emit_barrier(sctx, cs);

   si_emit_sqtt_stop(sctx, cs, ip_type);

   if (sctx->spm.bo)
      si_pc_emit_spm_reset(cs);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}

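/* Pre-record one start and one stop command buffer per IP type so that
 * enabling/disabling SQTT at runtime only requires flushing them. */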
static void si_sqtt_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   for (unsigned i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
      sctx->sqtt->start_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
      if (!ws->cs_create(sctx->sqtt->start_cs[i], sctx->ctx, (enum amd_ip_type)i,
                         NULL, NULL)) {
         free(sctx->sqtt->start_cs[i]);
         sctx->sqtt->start_cs[i] = NULL;
         return;
      }
      si_sqtt_start(sctx, sctx->sqtt->start_cs[i]);

      sctx->sqtt->stop_cs[i] = CALLOC_STRUCT(radeon_cmdbuf);
      if (!ws->cs_create(sctx->sqtt->stop_cs[i], sctx->ctx, (enum amd_ip_type)i,
                         NULL, NULL)) {
         ws->cs_destroy(sctx->sqtt->start_cs[i]);
         free(sctx->sqtt->start_cs[i]);
         sctx->sqtt->start_cs[i] = NULL;
         free(sctx->sqtt->stop_cs[i]);
         sctx->sqtt->stop_cs[i] = NULL;
         return;
      }

      si_sqtt_stop(sctx, sctx->sqtt->stop_cs[i]);
   }
}

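/* Begin/end simply submit the pre-recorded start/stop command buffers for
 * the queue matching the current command stream's IP type. */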
static void si_begin_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->sqtt->start_cs[sctx->ws->cs_get_ip_type(rcs)];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void si_end_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->sqtt->stop_cs[sctx->ws->cs_get_ip_type(rcs)];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}

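/* Called when the captured trace didn't fit: free the old BO, double the
 * per-SE buffer size and allocate a new one. */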
static bool
si_sqtt_resize_bo(struct si_context *sctx)
{
   /* Destroy the previous thread trace BO. */
   struct pb_buffer_lean *bo = sctx->sqtt->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   /* Double the size of the thread trace buffer per SE. */
   sctx->sqtt->buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           sctx->sqtt->buffer_size / 1024);

   /* Re-create the thread trace BO. */
   return si_sqtt_init_bo(sctx);
}

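/* Map the thread trace BO and parse it. On overflow, grow the BO and
 * re-record the start/stop command buffers so the next capture uses the
 * larger buffer; the current capture is discarded. */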
static bool si_get_sqtt_trace(struct si_context *sctx,
                              struct ac_sqtt_trace *sqtt)
{
   memset(sqtt, 0, sizeof(*sqtt));

   sctx->sqtt->ptr =
      sctx->ws->buffer_map(sctx->ws, sctx->sqtt->bo, NULL, PIPE_MAP_READ);

   if (!sctx->sqtt->ptr)
      return false;

   if (!ac_sqtt_get_trace(sctx->sqtt, &sctx->screen->info, sqtt)) {
      if (!si_sqtt_resize_bo(sctx)) {
         fprintf(stderr, "radeonsi: Failed to resize the SQTT buffer.\n");
      } else {
         for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
            sctx->screen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
            sctx->screen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
         }
         si_sqtt_init_cs(sctx);
      }
      return false;
   }
   return true;
}

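/* One-time SQTT initialization. Behavior can be tuned through environment
 * variables (the values below are illustrative examples, not defaults unless
 * stated):
 *   AMD_THREAD_TRACE_BUFFER_SIZE=65536        buffer size in KB per SE (default 32768)
 *   AMD_THREAD_TRACE_INSTRUCTION_TIMING=false disable instruction timing (default true)
 *   AMD_THREAD_TRACE_TRIGGER=500              capture at frame 500; a non-numeric value
 *                                             is treated as a trigger file path
 *   AMD_THREAD_TRACE_SPM=true                 force SPM counters on/off
 */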
bool si_init_sqtt(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->sqtt = CALLOC_STRUCT(ac_sqtt);

   if (sctx->gfx_level < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
                      "the RGP documentation for the list of "
                      "supported GPUs!\n");
      return false;
   }

   if (sctx->gfx_level > GFX11) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
                      "for that GPU!\n");
      return false;
   }

   /* Default buffer size set to 32MB per SE. */
   sctx->sqtt->buffer_size =
      debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
   sctx->sqtt->instruction_timing_enabled =
      debug_get_bool_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING", true);
   sctx->sqtt->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->sqtt->start_frame = atoi(trigger);
      if (sctx->sqtt->start_frame <= 0) {
         /* This isn't a frame number, must be a file */
         sctx->sqtt->trigger_file = strdup(trigger);
         sctx->sqtt->start_frame = -1;
      }
   }

   if (!si_sqtt_init_bo(sctx))
      return false;

   sctx->sqtt->pipeline_bos = _mesa_hash_table_u64_create(NULL);

   ac_sqtt_init(sctx->sqtt);

   if (sctx->gfx_level >= GFX10 &&
       debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) {
      /* Limit SPM counters to GFX10 and GFX10_3 for now */
      ASSERTED bool r = si_spm_init(sctx);
      assert(r);
   }

   si_sqtt_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}

void si_destroy_sqtt(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer_lean *bo = sctx->sqtt->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->sqtt->trigger_file)
      free(sctx->sqtt->trigger_file);

   for (int i = 0; i < ARRAY_SIZE(sctx->sqtt->start_cs); i++) {
      sscreen->ws->cs_destroy(sctx->sqtt->start_cs[i]);
      sscreen->ws->cs_destroy(sctx->sqtt->stop_cs[i]);
   }

   struct rgp_pso_correlation *pso_correlation =
      &sctx->sqtt->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->sqtt->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
   list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
                             &pso_correlation->record, list) {
      list_del(&record->list);
      pso_correlation->record_count--;
      free(record);
   }

   list_for_each_entry_safe (struct rgp_loader_events_record, record,
                             &loader_events->record, list) {
      list_del(&record->list);
      loader_events->record_count--;
      free(record);
   }

   list_for_each_entry_safe (struct rgp_code_object_record, record,
                             &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
      code_object->record_count--;
   }

   ac_sqtt_finish(sctx->sqtt);

   hash_table_foreach (sctx->sqtt->pipeline_bos->table, entry) {
      struct si_sqtt_fake_pipeline *pipeline =
         (struct si_sqtt_fake_pipeline *)entry->data;
      si_resource_reference(&pipeline->bo, NULL);
      FREE(pipeline);
   }

   free(sctx->sqtt);
   sctx->sqtt = NULL;

   if (sctx->spm.bo)
      si_spm_finish(sctx);
}

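/* Per-frame SQTT handling: num_frames implements the frame-based trigger.
 * si_handle_sqtt() checks the frame/file trigger and starts a capture, or,
 * if one is already running, stops it, waits for the GPU, reads the trace
 * (and SPM counters) back and dumps an RGP capture. */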
static uint64_t num_frames = 0;

void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->sqtt_enabled) {
      bool frame_trigger = num_frames == sctx->sqtt->start_frame;
      bool file_trigger = false;
      if (sctx->sqtt->trigger_file &&
          access(sctx->sqtt->trigger_file, W_OK) == 0) {
         if (unlink(sctx->sqtt->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread "
                            "trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence,
                              OS_TIMEOUT_INFINITE);

         /* Start SQTT */
         si_begin_sqtt(sctx, rcs);

         sctx->sqtt_enabled = true;
         sctx->sqtt->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is
          * called for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_sqtt_trace sqtt_trace = {0};

      /* Stop SQTT */
      si_end_sqtt(sctx, rcs);
      sctx->sqtt_enabled = false;
      sctx->sqtt->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence,
                               OS_TIMEOUT_INFINITE) &&
          si_get_sqtt_trace(sctx, &sqtt_trace)) {
         struct ac_spm_trace spm_trace;

         /* Map the SPM counter buffer */
         if (sctx->spm.bo) {
            sctx->spm.ptr = sctx->ws->buffer_map(
               sctx->ws, sctx->spm.bo, NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);
            ac_spm_get_trace(&sctx->spm, &spm_trace);
         }

         ac_dump_rgp_capture(&sctx->screen->info, &sqtt_trace,
                             sctx->spm.bo ? &spm_trace : NULL);

         if (sctx->spm.ptr)
            sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
         if (!sctx->sqtt->trigger_file) {
            sctx->sqtt->start_frame = num_frames + 10;
         }
      }
   }

   num_frames++;
}

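/* Write marker data into the SQ_THREAD_TRACE_USERDATA_2/3 registers, two
 * dwords at a time, so that the values appear in the thread trace stream. */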
static void si_emit_sqtt_userdata(struct si_context *sctx,
                                  struct radeon_cmdbuf *cs, const void *data,
                                  uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_set_uconfig_perfctr_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}

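/* Toggle the SQG top/bottom-of-pipe events that feed the thread trace.
 * On GFX6-GFX8, SPI_CONFIG_CNTL is privileged and must be written through
 * the privileged-register path. */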
static void
si_emit_spi_config_cntl(struct si_context *sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

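/* RGP event markers: one marker is emitted per API event (draw/dispatch),
 * tagged with a monotonically increasing command id, so RGP can correlate
 * wavefronts with API-level events. */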
static uint32_t num_events = 0;
void si_sqtt_write_event_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t vertex_offset_user_data,
                                uint32_t instance_offset_user_data,
                                uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

void si_write_event_with_dims_marker(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                     enum rgp_sqtt_marker_event_type api_type,
                                     uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

void si_sqtt_describe_barrier_start(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void si_sqtt_describe_barrier_end(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                                  unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_BARRIER_SYNC_VS)
      marker.vs_partial_flush = true;
   if (flags & SI_BARRIER_SYNC_PS)
      marker.ps_partial_flush = true;
   if (flags & SI_BARRIER_SYNC_CS)
      marker.cs_partial_flush = true;

   if (flags & SI_BARRIER_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_BARRIER_INV_VMEM)
      marker.inval_tcp = true;
   if (flags & SI_BARRIER_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_BARRIER_INV_SMEM)
      marker.inval_sqK = true;
   if (flags & SI_BARRIER_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_BARRIER_SYNC_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_BARRIER_SYNC_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

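/* User (debug label) markers. Pop events carry no string; other events append
 * the string, truncated to 1024 bytes and padded to a dword multiple. */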
void si_write_user_event(struct si_context *sctx, struct radeon_cmdbuf *rcs,
                         enum rgp_sqtt_marker_user_event_type type,
                         const char *str, int len)
{
   if (type == UserEventPop) {
      assert(str == NULL);
      struct rgp_sqtt_marker_user_event marker = {0};
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert(str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = {0};
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_sqtt_userdata(sctx, rcs, buffer,
                            sizeof(marker) / 4 + marker.length / 4);
   }
}

bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt,
                                    uint64_t pipeline_hash)
{
   simple_mtx_lock(&sqtt->rgp_pso_correlation.lock);
   list_for_each_entry_safe (struct rgp_pso_correlation_record, record,
                             &sqtt->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);

   return false;
}

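/* Map a pipe shader stage (plus its shader key) to the RGP hardware stage,
 * accounting for merged/NGG stages on the geometry pipeline. */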
static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key *key, enum pipe_shader_type stage)
{
   switch (stage) {
   case PIPE_SHADER_VERTEX:
      if (key->ge.as_ls)
         return RGP_HW_STAGE_LS;
      else if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case PIPE_SHADER_TESS_EVAL:
      if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case PIPE_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case PIPE_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}

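/* Snapshot the currently bound shaders (graphics when gfx_sh_offsets is
 * non-NULL, otherwise the compute shader) into an RGP code object record,
 * including a copy of each uploaded binary and its resource usage. */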
static bool
si_sqtt_add_code_object(struct si_context *sctx,
                        struct si_sqtt_fake_pipeline *pipeline,
                        uint32_t *gfx_sh_offsets)
{
   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
   struct rgp_code_object_record *record;
   bool is_compute = gfx_sh_offsets == NULL;

   record = calloc(1, sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline->code_hash;
   record->pipeline_hash[1] = pipeline->code_hash;

   for (unsigned i = 0; i < MESA_VULKAN_SHADER_STAGES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i <= PIPE_SHADER_FRAGMENT) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = pipeline->bo->gpu_address + (is_compute ? 0 : gfx_sh_offsets[i]);
      unsigned lds_increment = sctx->gfx_level >= GFX11 && i == MESA_SHADER_FRAGMENT ?
         1024 : sctx->screen->info.lds_encode_granularity;

      memset(record->shader_data[i].rt_shader_name, 0, sizeof(record->shader_data[i].rt_shader_name));
      record->shader_data[i].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[i].hash[1] = record->shader_data[i].hash[0];
      record->shader_data[i].code_size = shader->binary.uploaded_code_size;
      record->shader_data[i].code = code;
      record->shader_data[i].vgpr_count = shader->config.num_vgprs;
      record->shader_data[i].sgpr_count = shader->config.num_sgprs;
      record->shader_data[i].base_address = va & 0xffffffffffff;
      record->shader_data[i].elf_symbol_offset = 0;
      record->shader_data[i].hw_stage = hw_stage;
      record->shader_data[i].is_combined = false;
      record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
      record->shader_data[i].lds_size = shader->config.lds_size * lds_increment;
      record->shader_data[i].wavefront_size = shader->wave_size;

      record->shader_stages_mask |= 1 << i;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

bool si_sqtt_register_pipeline(struct si_context *sctx, struct si_sqtt_fake_pipeline *pipeline,
                               uint32_t *gfx_sh_offsets)
{
   assert(!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline->code_hash));

   bool result = ac_sqtt_add_pso_correlation(sctx->sqtt, pipeline->code_hash, pipeline->code_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(
      sctx->sqtt, pipeline->code_hash, pipeline->bo->gpu_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline, gfx_sh_offsets);
}

void si_sqtt_describe_pipeline_bind(struct si_context *sctx,
                                    uint64_t pipeline_hash,
                                    int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->sqtt_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_sqtt_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}