1 /*
2 * Copyright 2023 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 #include <stdio.h>
7 #include <stdarg.h>
8 #include <string.h>
9
10 #include "util/hash_table.h"
11 #include "util/u_process.h"
12 #include "util/hash_table.h"
13
14 #include "si_pipe.h"
15 #include "si_perfetto.h"
16 #include "si_tracepoints.h"
17
18 #ifdef HAVE_PERFETTO
19
20 #include "util/perf/u_perfetto.h"
21 #include "util/perf/u_perfetto_renderpass.h"
22
23 #include "si_tracepoints_perfetto.h"
24
25 /* Just naming stages */
/* Just naming stages.
 *
 * Table of human-readable names for each si_ds_queue_stage, used when
 * interning the GPU specification descriptors sent to perfetto.
 */
static const struct {
   const char *name;

   /* The perfetto UI requires that there is a parent-child relationship
    * within a row of elements. Which means that all children elements must
    * end within the lifespan of their parent.
    *
    * Some elements like stalls and command buffers follow that relationship,
    * but not all. This tells us in which UI row the elements should live.
    */
   enum si_ds_queue_stage draw_stage;
} si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
   /* Order must match the enum! Entries are indexed directly by
    * si_ds_queue_stage (see send_descriptors()).
    */
   {
      "queue",
      SI_DS_QUEUE_STAGE_QUEUE,
   },
   {
      "compute",
      SI_DS_QUEUE_STAGE_COMPUTE,
   },
   {
      "draw",
      SI_DS_QUEUE_STAGE_DRAW,
   }
};
52
/* Per-tracing-session incremental state.  was_cleared starts out true so
 * that the first packet emitted on a fresh (or restarted) session re-sends
 * the interned descriptors (see send_descriptors()).
 */
struct SIRenderpassIncrementalState {
   bool was_cleared = true;
};
56
/* Data source traits: plugs our incremental state type into the perfetto
 * data source machinery.
 */
struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = SIRenderpassIncrementalState;
};
60
/* The radeonsi render-stage data source.  All behavior comes from the shared
 * MesaRenderpassDataSource base; this subclass only binds the traits above.
 */
class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource,
                                                               SIRenderpassTraits> {
};

/* Perfetto requires these macros to declare/define the static members of a
 * data source exactly once in the program.
 */
PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
67
68 using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;
69
sync_timestamp(SIRenderpassDataSource::TraceContext & ctx,struct si_ds_device * device)70 static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
71 {
72 uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
73 uint64_t gpu_ts;
74
75 struct si_context *sctx = container_of(device, struct si_context, ds);
76 gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
77
78
79 cpu_ts = perfetto::base::GetBootTimeNs().count();
80
81 if (cpu_ts < device->next_clock_sync_ns)
82 return;
83
84 PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);
85
86 device->sync_gpu_ts = gpu_ts;
87 device->next_clock_sync_ns = cpu_ts + 1000000000ull;
88 MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::
89 EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
90 }
91
/* (Re-)emit all interned descriptors for this device: the graphics context
 * and, for every queue/stage pair, the hw-queue and stage specifications.
 * Called whenever perfetto reports the incremental state was cleared (new or
 * restarted tracing session), since interned data does not survive a clear.
 * Also resets per-stage bookkeeping and forces an immediate clock sync.
 */
static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx,
                             struct si_ds_device *device)
{
   PERFETTO_LOG("Sending renderstage descriptors");

   /* Restart event IDs and drop any in-flight stage start timestamps; events
    * begun under a previous session cannot be completed in this one.
    */
   device->event_id = 0;
   list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
         queue->stages[s].start_ns[0] = 0;
      }
   }

   {
      auto packet = ctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      /* Mark this packet as the start of a fresh incremental-state epoch. */
      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

      auto interned_data = packet->set_interned_data();

      {
         auto desc = interned_data->add_graphics_contexts();
         desc->set_iid(device->iid);
         desc->set_pid(getpid());
         switch (device->api) {
         case AMD_DS_API_OPENGL:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
            break;
         case AMD_DS_API_VULKAN:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
            break;
         default:
            break;
         }
      }

      /* Emit all the IID picked at device/queue creation. */
      list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
         for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
            {
               /* We put the stage number in there so that all rows are order
                * by si_ds_queue_stage.
                */
               char name[100];
               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(),
                        queue->name, s, si_queue_stage_desc[s].name);

               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].queue_iid);
               desc->set_name(name);
            }
            {
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].stage_iid);
               desc->set_name(si_queue_stage_desc[s].name);
            }
         }
      }
   }

   /* Force sync_timestamp() past its rate limiter so the new session gets a
    * clock correlation right away.
    */
   device->next_clock_sync_ns = 0;
   sync_timestamp(ctx, device);
}
156
begin_event(struct si_ds_queue * queue,uint64_t ts_ns,enum si_ds_queue_stage stage_id)157 static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
158 {
159 PERFETTO_LOG("begin event called - ts_ns=%" PRIu64, ts_ns);
160 uint32_t level = queue->stages[stage_id].level;
161 /* If we haven't managed to calibrate the alignment between GPU and CPU
162 * timestamps yet, then skip this trace, otherwise perfetto won't know
163 * what to do with it.
164 */
165 if (!queue->device->sync_gpu_ts) {
166 queue->stages[stage_id].start_ns[level] = 0;
167 return;
168 }
169
170 if (level >= (ARRAY_SIZE(queue->stages[stage_id].start_ns) - 1))
171 return;
172
173 queue->stages[stage_id].start_ns[level] = ts_ns;
174 queue->stages[stage_id].level++;
175 }
176
/* Close the innermost open event on a queue stage and emit it to perfetto as
 * a GpuRenderStageEvent.  app_event, when non-NULL, names an application
 * debug marker and gets a dynamically interned stage IID; otherwise the
 * driver's pre-interned stage IID is used.  payload/payload_as_extra
 * optionally attach tracepoint arguments as extra data on the event.
 */
static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id,
                      uint32_t submission_id, const char *app_event, const void* payload = nullptr,
                      trace_payload_as_extra_func payload_as_extra = nullptr)
{
   PERFETTO_LOG("end event called - ts_ns=%" PRIu64, ts_ns);
   struct si_ds_device *device = queue->device;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!device->sync_gpu_ts)
      return;

   /* No matching begin_event() — nothing to close. */
   if (queue->stages[stage_id].level == 0)
      return;

   /* Pop the innermost nesting level for this stage. */
   uint32_t level = --queue->stages[stage_id].level;
   struct si_ds_stage *stage = &queue->stages[stage_id];
   uint64_t start_ns = stage->start_ns[level];
   PERFETTO_LOG("end event called - start_ns=%" PRIu64 " ts_ns=%" PRIu64, start_ns, ts_ns);
   /* Drop events whose start was cleared (begun before calibration) or whose
    * timestamps are inverted.
    */
   if (!start_ns || start_ns > ts_ns)
      return;

   /* Only runs when the data source is active; captures by value since the
    * lambda may be invoked per active tracing session.
    */
   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Fresh/restarted session: re-send interned descriptors first. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      uint64_t evt_id = device->event_id++;

      /* If this is an application event, we might need to generate a new
       * stage_iid if not already seen. Otherwise, it's a driver event and we
       * have use the internal stage_iid.
       */
      uint64_t stage_iid = app_event ?
         tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) :
         stage->stage_iid;

      auto packet = tctx.NewTracePacket();

      /* Timestamps are in the GPU clock domain; perfetto maps them to CPU
       * time via the clock syncs emitted above.
       */
      packet->set_timestamp(start_ns);
      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);

      assert(ts_ns >= start_ns);

      auto event = packet->set_gpu_render_stage_event();
      event->set_gpu_id(queue->device->gpu_id);

      event->set_hw_queue_iid(stage->queue_iid);
      event->set_stage_iid(stage_iid);
      event->set_context(queue->device->iid);
      event->set_event_id(evt_id);
      event->set_duration(ts_ns - start_ns);
      event->set_submission_id(submission_id);

      if (payload && payload_as_extra) {
         payload_as_extra(event, payload, nullptr);
      }
   });

   /* Consume the start timestamp so it cannot be reused. */
   stage->start_ns[level] = 0;
}
243
244 #endif /* HAVE_PERFETTO */
245
246 #ifdef __cplusplus
247 extern "C" {
248 #endif
249
250 #ifdef HAVE_PERFETTO
251
252 /*
253 * Trace callbacks, called from u_trace once the timestamps from GPU have been
254 * collected.
255 */
256
/* Generates the si_ds_begin_<event>/si_ds_end_<event> callback pair invoked
 * by u_trace once GPU timestamps are collected.  The begin callback only
 * records the start timestamp; the end callback emits the perfetto event and
 * forwards the tracepoint payload via trace_payload_as_extra_si_end_<event>.
 * flush_data is the si_ds_flush_data set up in si_ds_flush_data_init().
 */
#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage) \
   void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx, \
                                 const void *flush_data, \
                                 const struct trace_si_begin_##event_name *payload, \
                                 const void *indirect_data) \
   { \
      const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data; \
      begin_event(flush->queue, ts_ns, stage); \
   } \
   \
   void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx, \
                               const void *flush_data, \
                               const struct trace_si_end_##event_name *payload, \
                               const void *indirect_data) \
   { \
      const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data; \
      end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload, \
                (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name); \
   } \

/* Instantiate the callbacks for the draw and compute tracepoints. */
CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)
279
/* Called at submission start; returns the CPU boottime timestamp (ns) that
 * si_ds_end_submit() later uses as the submit event's start time.
 */
uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
{
   return perfetto::base::GetBootTimeNs().count();
}
284
/* Called at submission end.  Emits a vk_queue_submit perfetto event covering
 * [start_ts, now] and bumps the queue's submission id (which the render
 * stage events reference).  When tracing is inactive, resets the clock
 * calibration so it is redone when tracing resumes.
 */
void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
{
   if (!u_trace_should_process(&queue->device->trace_context)) {
      /* Invalidate calibration: sync_timestamp() will re-sync on the next
       * active submission.
       */
      queue->device->sync_gpu_ts = 0;
      queue->device->next_clock_sync_ns = 0;
      return;
   }

   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
   uint32_t submission_id = queue->submission_id++;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Fresh/restarted session: re-send interned descriptors first. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      /* CPU boottime domain (the trace's default clock). */
      packet->set_timestamp(start_ts);

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      submit->set_duration_ns(end_ts - start_ts);
      submit->set_vk_queue((uintptr_t) queue);
      submit->set_submission_id(submission_id);
   });
}
316
317 #endif /* HAVE_PERFETTO */
318
/* One-time process-wide setup: initialize the perfetto glue and register the
 * render-stage data source under the "gpu.renderstages.amd" name.  Invoked
 * through call_once() from si_driver_ds_init().
 */
static void si_driver_ds_init_once(void)
{
#ifdef HAVE_PERFETTO
   util_perfetto_init();
   perfetto::DataSourceDescriptor dsd;
   dsd.set_name("gpu.renderstages.amd");
   SIRenderpassDataSource::Register(dsd);
#endif
}
328
static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;

/* Next interned ID to hand out; starts at 1 (0 is presumably reserved as an
 * invalid IID for perfetto interned data — confirm against the proto docs).
 */
static uint64_t iid = 1;

/* Allocate a fresh, process-unique interned ID for devices and queue stages.
 * NOTE(review): the increment is not atomic — this assumes device/queue
 * initialization is serialized by the caller; verify if screens can be
 * created from multiple threads concurrently.
 */
static uint64_t get_iid()
{
   return iid++;
}
336
si_pps_clock_id(uint32_t gpu_id)337 static uint32_t si_pps_clock_id(uint32_t gpu_id)
338 {
339 char buf[40];
340 snprintf(buf, sizeof(buf), "org.freedesktop.mesa.amd.gpu%u", gpu_id);
341
342 return _mesa_hash_string(buf) | 0x80000000;
343 }
344
/* Public entry point for driver-side tracing setup: performs the once-only
 * perfetto data source registration and (re)reads the tracepoint
 * configuration environment variable.  Safe to call multiple times.
 */
void si_driver_ds_init(void)
{
   call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
   si_gpu_tracepoint_config_variable();
}
350
si_ds_device_init(struct si_ds_device * device,const struct radeon_info * devinfo,uint32_t gpu_id,enum amd_ds_api api)351 void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
352 uint32_t gpu_id, enum amd_ds_api api)
353 {
354 device->gpu_id = gpu_id;
355 device->gpu_clock_id = si_pps_clock_id(gpu_id);
356 device->info = devinfo;
357 device->iid = get_iid();
358 device->api = api;
359 list_inithead(&device->queues);
360 }
361
/* Tear down the device's u_trace context.  Counterpart to the trace-context
 * setup done elsewhere (not in this file).
 */
void si_ds_device_fini(struct si_ds_device *device)
{
   u_trace_context_fini(&device->trace_context);
}
366
si_ds_device_init_queue(struct si_ds_device * device,struct si_ds_queue * queue,const char * fmt_name,...)367 struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device,
368 struct si_ds_queue *queue,
369 const char *fmt_name, ...)
370 {
371 va_list ap;
372 queue->device = device;
373
374 va_start(ap, fmt_name);
375 vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
376 va_end(ap);
377
378 for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
379 queue->stages[s].queue_iid = get_iid();
380 queue->stages[s].stage_iid = get_iid();
381 }
382
383 list_add(&queue->link, &device->queues);
384
385 return queue;
386 }
387
/* Initialize the per-flush tracing data: zero the whole struct, record the
 * owning queue and submission id (consumed by the si_ds_end_* callbacks),
 * and attach a fresh u_trace buffer to the device's trace context.  The
 * memset must come first — u_trace_init() writes into data->trace.
 */
void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
                           uint64_t submission_id)
{
   memset(data, 0, sizeof(*data));

   data->queue = queue;
   data->submission_id = submission_id;

   u_trace_init(&data->trace, &queue->device->trace_context);
}
398
/* Release the u_trace buffer owned by this flush data.  Counterpart to
 * si_ds_flush_data_init().
 */
void si_ds_flush_data_fini(struct si_ds_flush_data *data)
{
   u_trace_fini(&data->trace);
}
403
404 #ifdef __cplusplus
405 }
406 #endif
407