1 /*
2 * Copyright 2023 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 #include <stdio.h>
7 #include <stdarg.h>
8 #include <string.h>
9
10 #include "util/hash_table.h"
11 #include "util/u_process.h"
12 #include "util/hash_table.h"
13
14 #include "si_pipe.h"
15 #include "si_perfetto.h"
16 #include "si_tracepoints.h"
17
18 #ifdef HAVE_PERFETTO
19
20 #include "util/perf/u_perfetto.h"
21 #include "util/perf/u_perfetto_renderpass.h"
22
23 #include "si_tracepoints_perfetto.h"
24
25 /* Just naming stages */
26 static const struct {
27 const char *name;
28
29 /* The perfetto UI requires that there is a parent-child relationship
30 * within a row of elements. Which means that all children elements must
31 * end within the lifespan of their parent.
32 *
33 * Some elements like stalls and command buffers follow that relationship,
34 * but not all. This tells us in which UI row the elements should live.
35 */
36 enum si_ds_queue_stage draw_stage;
37 } si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
38 /* Order must match the enum! */
39 {
40 "queue",
41 SI_DS_QUEUE_STAGE_QUEUE,
42 },
43 {
44 "compute",
45 SI_DS_QUEUE_STAGE_COMPUTE,
46 },
47 {
48 "draw",
49 SI_DS_QUEUE_STAGE_DRAW,
50 }
51 };
52
/* Per-tracing-session incremental state. When perfetto (re)starts a session
 * or clears incremental state, was_cleared flips back to true and the next
 * emitted packet must re-send all interned descriptors (see send_descriptors).
 */
struct SIRenderpassIncrementalState {
   bool was_cleared = true;
};

/* Hook our incremental state type into the perfetto data source machinery. */
struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = SIRenderpassIncrementalState;
};

/* The perfetto data source emitting GPU render-stage events for radeonsi. */
class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource,
                                                               SIRenderpassTraits> {
};

PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);

using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;
69
sync_timestamp(SIRenderpassDataSource::TraceContext & ctx,struct si_ds_device * device)70 static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
71 {
72 uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
73 uint64_t gpu_ts;
74
75 struct si_context *sctx = container_of(device, struct si_context, ds);
76 gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
77
78
79 cpu_ts = perfetto::base::GetBootTimeNs().count();
80
81 if (cpu_ts < device->next_clock_sync_ns)
82 return;
83
84 PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);
85
86 device->sync_gpu_ts = gpu_ts;
87 device->next_clock_sync_ns = cpu_ts + 1000000000ull;
88 MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::
89 EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
90 }
91
/* (Re)send all interned descriptors for this device: the graphics context,
 * plus one hw-queue row and one stage name per (queue, stage) pair. Called
 * whenever perfetto's incremental state was cleared, since interned data does
 * not survive a state clear.
 */
static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx,
                             struct si_ds_device *device)
{
   PERFETTO_LOG("Sending renderstage descriptors");

   /* Restart event numbering and forget any in-flight stage start times;
    * they belong to the previous tracing session.
    */
   device->event_id = 0;
   list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
         queue->stages[s].start_ns[0] = 0;
      }
   }

   /* Scoped so the packet is finalized before sync_timestamp() below. */
   {
      auto packet = ctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      /* Mark this packet as the one re-establishing incremental state. */
      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

      auto interned_data = packet->set_interned_data();

      {
         /* Describe the API context (GL vs Vulkan) owning these events. */
         auto desc = interned_data->add_graphics_contexts();
         desc->set_iid(device->iid);
         desc->set_pid(getpid());
         switch (device->api) {
         case AMD_DS_API_OPENGL:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
            break;
         case AMD_DS_API_VULKAN:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
            break;
         default:
            break;
         }
      }

      /* Emit all the IID picked at device/queue creation. */
      list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
         for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
            {
               /* We put the stage number in there so that all rows are order
                * by si_ds_queue_stage.
                */
               char name[100];
               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(),
                        queue->name, s, si_queue_stage_desc[s].name);

               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].queue_iid);
               desc->set_name(name);
            }
            {
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].stage_iid);
               desc->set_name(si_queue_stage_desc[s].name);
            }
         }
      }
   }

   /* Force an immediate clock resync for the fresh session. */
   device->next_clock_sync_ns = 0;
   sync_timestamp(ctx, device);
}
156
/* Callback type used to attach a tracepoint payload as extra_data on a
 * GpuRenderStageEvent (see trace_payload_as_extra_si_end_* generated helpers).
 */
typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *,
                                            const void*);
159
begin_event(struct si_ds_queue * queue,uint64_t ts_ns,enum si_ds_queue_stage stage_id)160 static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
161 {
162 PERFETTO_LOG("begin event called - ts_ns=%" PRIu64, ts_ns);
163 uint32_t level = queue->stages[stage_id].level;
164 /* If we haven't managed to calibrate the alignment between GPU and CPU
165 * timestamps yet, then skip this trace, otherwise perfetto won't know
166 * what to do with it.
167 */
168 if (!queue->device->sync_gpu_ts) {
169 queue->stages[stage_id].start_ns[level] = 0;
170 return;
171 }
172
173 if (level >= (ARRAY_SIZE(queue->stages[stage_id].start_ns) - 1))
174 return;
175
176 queue->stages[stage_id].start_ns[level] = ts_ns;
177 queue->stages[stage_id].level++;
178 }
179
/* Close the innermost open stage event and emit it as a perfetto
 * GpuRenderStageEvent. app_event, when non-NULL, names an application debug
 * marker whose stage iid is interned on demand; payload/payload_as_extra
 * optionally attach tracepoint arguments as extra data.
 */
static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id,
                      uint32_t submission_id, const char *app_event, const void* payload = nullptr,
                      trace_payload_as_extra_func payload_as_extra = nullptr)
{
   PERFETTO_LOG("end event called - ts_ns=%" PRIu64, ts_ns);
   struct si_ds_device *device = queue->device;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!device->sync_gpu_ts)
      return;

   /* No matching begin_event() — nothing to close. */
   if (queue->stages[stage_id].level == 0)
      return;

   uint32_t level = --queue->stages[stage_id].level;
   struct si_ds_stage *stage = &queue->stages[stage_id];
   uint64_t start_ns = stage->start_ns[level];
   PERFETTO_LOG("end event called - start_ns=%" PRIu64 " ts_ns=%" PRIu64, start_ns, ts_ns);
   /* Skip events whose begin was dropped (start_ns == 0) or whose
    * timestamps are inconsistent.
    */
   if (!start_ns || start_ns > ts_ns)
      return;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-send interned descriptors if the session state was cleared. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      uint64_t evt_id = device->event_id++;

      /* If this is an application event, we might need to generate a new
       * stage_iid if not already seen. Otherwise, it's a driver event and we
       * have use the internal stage_iid.
       */
      uint64_t stage_iid = app_event ?
         tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) :
         stage->stage_iid;

      auto packet = tctx.NewTracePacket();

      /* The packet timestamp is the GPU-clock start of the stage; duration
       * carries the extent.
       */
      packet->set_timestamp(start_ns);
      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);

      assert(ts_ns >= start_ns);

      auto event = packet->set_gpu_render_stage_event();
      event->set_gpu_id(queue->device->gpu_id);

      event->set_hw_queue_iid(stage->queue_iid);
      event->set_stage_iid(stage_iid);
      event->set_context(queue->device->iid);
      event->set_event_id(evt_id);
      event->set_duration(ts_ns - start_ns);
      event->set_submission_id(submission_id);

      if (payload && payload_as_extra) {
         payload_as_extra(event, payload);
      }
   });

   /* Mark the slot consumed so a stale start can't be reused. */
   stage->start_ns[level] = 0;
}
246
247 #endif /* HAVE_PERFETTO */
248
249 #ifdef __cplusplus
250 extern "C" {
251 #endif
252
253 #ifdef HAVE_PERFETTO
254
255 /*
256 * Trace callbacks, called from u_trace once the timestamps from GPU have been
257 * collected.
258 */
259
/* Generate the begin/end u_trace callback pair for a given tracepoint.
 * The begin callback records the GPU timestamp; the end callback pops it and
 * emits the perfetto render-stage event, forwarding the end payload as extra
 * data via the generated trace_payload_as_extra_si_end_<event_name> helper.
 */
#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage) \
   void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx, \
                                 const void *flush_data, \
                                 const struct trace_si_begin_##event_name *payload) \
   { \
      const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data; \
      begin_event(flush->queue, ts_ns, stage); \
   } \
 \
   void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx, \
                               const void *flush_data, \
                               const struct trace_si_end_##event_name *payload) \
   { \
      const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data; \
      end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload, \
                (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name); \
   } \

/* Instantiate callbacks for the draw and compute tracepoints. */
CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)
280
281 uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
282 {
283 return perfetto::base::GetBootTimeNs().count();
284 }
285
/* Finish a submission started by si_ds_begin_submit(): emit a vulkan queue
 * submit event with the measured CPU duration, or reset clock-sync state when
 * tracing is disabled so sync is redone on the next enable.
 */
void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
{
   if (!u_trace_should_process(&queue->device->trace_context)) {
      /* Tracing is off: invalidate the GPU/CPU calibration so events are
       * skipped until a fresh sync happens.
       */
      queue->device->sync_gpu_ts = 0;
      queue->device->next_clock_sync_ns = 0;
      return;
   }

   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
   uint32_t submission_id = queue->submission_id++;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-send interned descriptors if the session state was cleared. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ts);

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      submit->set_duration_ns(end_ts - start_ts);
      /* The queue pointer doubles as a unique queue identifier. */
      submit->set_vk_queue((uintptr_t) queue);
      submit->set_submission_id(submission_id);
   });
}
317
318 #endif /* HAVE_PERFETTO */
319
/* One-time process-wide setup: initialize the perfetto producer and register
 * the AMD render-stages data source. Invoked via call_once from
 * si_driver_ds_init().
 */
static void si_driver_ds_init_once(void)
{
#ifdef HAVE_PERFETTO
   util_perfetto_init();
   perfetto::DataSourceDescriptor dsd;
   /* Name under which traces can enable this data source. */
   dsd.set_name("gpu.renderstages.amd");
   SIRenderpassDataSource::Register(dsd);
#endif
}
329
/* Guards the one-time data source registration above. */
static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;
/* Process-wide allocator for perfetto interned ids; starts at 1 because
 * iid 0 is treated as invalid by perfetto.
 */
static uint64_t iid = 1;

/* Return the next unique interned id.
 * NOTE(review): not atomic — presumably only called from a single thread
 * during device/queue creation; confirm against callers.
 */
static uint64_t get_iid()
{
   return iid++;
}
337
si_pps_clock_id(uint32_t gpu_id)338 static uint32_t si_pps_clock_id(uint32_t gpu_id)
339 {
340 char buf[40];
341 snprintf(buf, sizeof(buf), "org.freedesktop.mesa.amd.gpu%u", gpu_id);
342
343 return _mesa_hash_string(buf) | 0x80000000;
344 }
345
/* Public entry point: ensure the perfetto data source is registered (once per
 * process) and read the tracepoint configuration environment variable.
 */
void si_driver_ds_init(void)
{
   call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
   si_gpu_tracepoint_config_variable();
}
351
si_ds_device_init(struct si_ds_device * device,const struct radeon_info * devinfo,uint32_t gpu_id,enum amd_ds_api api)352 void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
353 uint32_t gpu_id, enum amd_ds_api api)
354 {
355 device->gpu_id = gpu_id;
356 device->gpu_clock_id = si_pps_clock_id(gpu_id);
357 device->info = devinfo;
358 device->iid = get_iid();
359 device->api = api;
360 list_inithead(&device->queues);
361 }
362
/* Tear down the device's u_trace context; counterpart of si_ds_device_init()
 * (the queue list itself owns no allocations to free here).
 */
void si_ds_device_fini(struct si_ds_device *device)
{
   u_trace_context_fini(&device->trace_context);
}
367
si_ds_device_init_queue(struct si_ds_device * device,struct si_ds_queue * queue,const char * fmt_name,...)368 struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device,
369 struct si_ds_queue *queue,
370 const char *fmt_name, ...)
371 {
372 va_list ap;
373 queue->device = device;
374
375 va_start(ap, fmt_name);
376 vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
377 va_end(ap);
378
379 for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
380 queue->stages[s].queue_iid = get_iid();
381 queue->stages[s].stage_iid = get_iid();
382 }
383
384 list_add(&queue->link, &device->queues);
385
386 return queue;
387 }
388
/* Initialize the per-flush tracking data: zero it, bind it to its queue and
 * submission id, and set up its u_trace buffer on the device's context.
 */
void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
                           uint64_t submission_id)
{
   memset(data, 0, sizeof(*data));

   data->queue = queue;
   data->submission_id = submission_id;

   u_trace_init(&data->trace, &queue->device->trace_context);
}
399
/* Release the u_trace buffer owned by this flush data; counterpart of
 * si_ds_flush_data_init().
 */
void si_ds_flush_data_fini(struct si_ds_flush_data *data)
{
   u_trace_fini(&data->trace);
}
404
405 #ifdef __cplusplus
406 }
407 #endif
408