/*
 * Copyright © 2021 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#include <perfetto.h>

#include "tu_perfetto.h"
#include "tu_device.h"

#include "util/hash_table.h"
#include "util/perf/u_perfetto.h"
#include "util/perf/u_perfetto_renderpass.h"

#include "tu_tracepoints.h"
#include "tu_tracepoints_perfetto.h"

/* We can't include tu_knl.h, so forward-declare the helpers we need from
 * it:
 */

int
tu_device_get_gpu_timestamp(struct tu_device *dev,
                            uint64_t *ts);
int
tu_device_get_suspend_count(struct tu_device *dev,
                            uint64_t *suspend_count);
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

struct u_trace_context *
tu_device_get_u_trace(struct tu_device *device);

/**
 * Queue IDs
 */
enum {
   DEFAULT_HW_QUEUE_ID,
};

/**
 * Render-stage IDs. These double as indices into the stages[] table
 * below and as the stage_id values referenced by GpuRenderStageEvent,
 * so the enum and the table must stay in sync.
 */
enum tu_stage_id {
   CMD_BUFFER_STAGE_ID,
   CMD_BUFFER_ANNOTATION_STAGE_ID,
   RENDER_PASS_STAGE_ID,
   CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
   BINNING_STAGE_ID,
   GMEM_STAGE_ID,
   BYPASS_STAGE_ID,
   BLIT_STAGE_ID,
   COMPUTE_STAGE_ID,
   CLEAR_SYSMEM_STAGE_ID,
   CLEAR_GMEM_STAGE_ID,
   GMEM_LOAD_STAGE_ID,
   GMEM_STORE_STAGE_ID,
   SYSMEM_RESOLVE_STAGE_ID,
   // TODO: add the rest from fd_stage_id
};

static const struct {
   const char *name;
   const char *desc;
} queues[] = {
   [DEFAULT_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
};

static const struct {
   const char *name;
   const char *desc;
} stages[] = {
   [CMD_BUFFER_STAGE_ID] = { "Command Buffer" },
   [CMD_BUFFER_ANNOTATION_STAGE_ID] = { "Annotation", "Command Buffer Annotation" },
   [RENDER_PASS_STAGE_ID] = { "Render Pass" },
   [CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID] = { "Annotation", "Render Pass Command Buffer Annotation" },
   [BINNING_STAGE_ID] = { "Binning", "Perform visibility pass and determine target bins" },
   [GMEM_STAGE_ID] = { "GMEM", "Rendering to GMEM" },
   [BYPASS_STAGE_ID] = { "Bypass", "Rendering to system memory" },
   [BLIT_STAGE_ID] = { "Blit", "Performing a blit operation" },
   [COMPUTE_STAGE_ID] = { "Compute", "Compute job" },
   [CLEAR_SYSMEM_STAGE_ID] = { "Clear Sysmem", "" },
   [CLEAR_GMEM_STAGE_ID] = { "Clear GMEM", "Per-tile (GMEM) clear" },
   [GMEM_LOAD_STAGE_ID] = { "GMEM Load", "Per-tile system memory to GMEM load" },
   [GMEM_STORE_STAGE_ID] = { "GMEM Store", "Per-tile GMEM to system memory store" },
   [SYSMEM_RESOLVE_STAGE_ID] = { "SysMem Resolve", "System memory MSAA resolve" },
   // TODO: add the rest
};

static uint32_t gpu_clock_id;
static uint64_t next_clock_sync_ns; /* cpu time of next clk sync */

/**
 * The timestamp at the point where we first emitted the clock_sync;
 * this will be a *later* timestamp than the first GPU traces (since
 * we capture the first clock_sync from the CPU *after* the first GPU
 * tracepoints happen). To avoid confusing perfetto we need to drop
 * the GPU traces with timestamps before this.
 */
static uint64_t sync_gpu_ts;

static uint64_t last_suspend_count;

static uint64_t gpu_max_timestamp;
static uint64_t gpu_timestamp_offset;
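
/* A sketch of how the offset keeps the GPU clock monotonic across a
 * suspend/resume cycle (illustrative numbers, not from a real trace):
 * suppose the GPU counter reads 5,000,000 ns just before suspend
 * (gpu_max_timestamp == 5,000,000) and restarts from 0 afterwards. On
 * the next sync we set gpu_timestamp_offset = gpu_max_timestamp, so a
 * raw post-resume timestamp of 100,000 ns is reported to perfetto as
 * 100,000 + 5,000,000 = 5,100,000 ns, which is still monotonic.
 */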
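
/* Perfetto may clear its incremental state at any time (e.g. when a new
 * consumer connects); was_cleared tells us to re-send the interned
 * queue/stage descriptors before emitting the next event.
 */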
struct TuRenderpassIncrementalState {
   bool was_cleared = true;
};

struct TuRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = TuRenderpassIncrementalState;
};

class TuRenderpassDataSource : public MesaRenderpassDataSource<TuRenderpassDataSource,
                                                               TuRenderpassTraits> {
   void OnStart(const StartArgs &args) override
   {
      MesaRenderpassDataSource<TuRenderpassDataSource, TuRenderpassTraits>::OnStart(args);

      /* Note: clock_ids below 128 are reserved; for custom clock sources,
       * using the hash of a namespaced string is the recommended approach.
       * OR-ing in the top bit keeps the resulting id well clear of the
       * reserved range. See: https://perfetto.dev/docs/concepts/clock-sync
       */
      gpu_clock_id =
         _mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000;

      gpu_timestamp_offset = 0;
      gpu_max_timestamp = 0;
      last_suspend_count = 0;
   }
};

PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);

static void
send_descriptors(TuRenderpassDataSource::TraceContext &ctx)
{
   PERFETTO_LOG("Sending renderstage descriptors");

   auto packet = ctx.NewTracePacket();

   /* This must be set before interned data is sent. */
   packet->set_sequence_flags(
      perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

   packet->set_timestamp(0);

   auto event = packet->set_gpu_render_stage_event();
   event->set_gpu_id(0);

   auto spec = event->set_specifications();

   for (unsigned i = 0; i < ARRAY_SIZE(queues); i++) {
      auto desc = spec->add_hw_queue();

      desc->set_name(queues[i].name);
      desc->set_description(queues[i].desc);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(stages); i++) {
      auto desc = spec->add_stage();

      desc->set_name(stages[i].name);
      if (stages[i].desc)
         desc->set_description(stages[i].desc);
   }
}
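
/* Stages can nest (e.g. a blit inside a render pass inside a command
 * buffer). Pushes beyond the fixed-size stack are only counted in
 * skipped_depth, so that stage_pop() can swallow the matching pops and
 * keep the stack balanced.
 */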
static struct tu_perfetto_stage *
stage_push(struct tu_device *dev)
{
   struct tu_perfetto_state *p = &dev->perfetto;

   if (p->stage_depth >= ARRAY_SIZE(p->stages)) {
      p->skipped_depth++;
      return NULL;
   }

   return &p->stages[p->stage_depth++];
}

typedef void (*trace_payload_as_extra_func)(
   perfetto::protos::pbzero::GpuRenderStageEvent *, const void *);

static struct tu_perfetto_stage *
stage_pop(struct tu_device *dev)
{
   struct tu_perfetto_state *p = &dev->perfetto;

   if (!p->stage_depth)
      return NULL;

   if (p->skipped_depth) {
      p->skipped_depth--;
      return NULL;
   }

   return &p->stages[--p->stage_depth];
}

static void
stage_start(struct tu_device *dev,
            uint64_t ts_ns,
            enum tu_stage_id stage_id,
            const char *app_event,
            const void *payload = nullptr,
            size_t payload_size = 0,
            trace_payload_as_extra_func payload_as_extra = nullptr)
{
   struct tu_perfetto_stage *stage = stage_push(dev);

   if (!stage) {
      PERFETTO_ELOG("stage %d is nested too deep", stage_id);
      return;
   }

   if (payload) {
      /* Take a copy of the payload; it is freed in stage_end() once the
       * event has been emitted.
       */
      void *new_payload = malloc(payload_size);
      if (new_payload)
         memcpy(new_payload, payload, payload_size);
      else
         PERFETTO_ELOG("Failed to allocate payload for stage %d", stage_id);
      payload = new_payload;
   }

   *stage = (struct tu_perfetto_stage) {
      .stage_id = stage_id,
      .stage_iid = 0,
      .start_ts = ts_ns,
      .payload = payload,
      .start_payload_function = (void *) payload_as_extra,
   };

   if (app_event) {
      TuRenderpassDataSource::Trace([=](auto tctx) {
         stage->stage_iid =
            tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event);
      });
   }
}

static void
stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
          const void *flush_data,
          const void *payload = nullptr,
          trace_payload_as_extra_func payload_as_extra = nullptr)
{
   struct tu_perfetto_stage *stage = stage_pop(dev);
   auto trace_flush_data =
      (const struct tu_u_trace_submission_data *) flush_data;
   uint32_t submission_id = trace_flush_data->submission_id;
   uint64_t gpu_ts_offset = trace_flush_data->gpu_ts_offset;

   if (!stage)
      return;

   if (stage->stage_id != stage_id) {
      PERFETTO_ELOG("stage %d ended while stage %d is expected",
                    stage_id, stage->stage_id);
      return;
   }

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!sync_gpu_ts)
      return;

   TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx);
         state->was_cleared = false;
      }

      auto packet = tctx.NewTracePacket();

      gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_ts_offset);

      packet->set_timestamp(stage->start_ts + gpu_ts_offset);
      packet->set_timestamp_clock_id(gpu_clock_id);

      auto event = packet->set_gpu_render_stage_event();
      event->set_event_id(0); // ???
      event->set_hw_queue_id(DEFAULT_HW_QUEUE_ID);
      event->set_duration(ts_ns - stage->start_ts);
      if (stage->stage_iid)
         event->set_stage_iid(stage->stage_iid);
      else
         event->set_stage_id(stage->stage_id);
      event->set_context((uintptr_t) dev);
      event->set_submission_id(submission_id);

      if (stage->payload) {
         if (stage->start_payload_function)
            ((trace_payload_as_extra_func) stage->start_payload_function)(
               event, stage->payload);
         free((void *) stage->payload);
      }

      if (payload && payload_as_extra)
         payload_as_extra(event, payload);
   });
}

#ifdef __cplusplus
extern "C" {
#endif

void
tu_perfetto_init(void)
{
   util_perfetto_init();

   perfetto::DataSourceDescriptor dsd;
#if DETECT_OS_ANDROID
   /* AGI requires this name */
   dsd.set_name("gpu.renderstages");
#else
   dsd.set_name("gpu.renderstages.msm");
#endif
   TuRenderpassDataSource::Register(dsd);
}
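
/* A trace config then enables this source by name, e.g. (a minimal
 * sketch using the non-Android name):
 *
 *    data_sources {
 *      config { name: "gpu.renderstages.msm" }
 *    }
 */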

static void
emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts)
{
   TuRenderpassDataSource::Trace([=](auto tctx) {
      MesaRenderpassDataSource<TuRenderpassDataSource,
                               TuRenderpassTraits>::EmitClockSync(tctx, cpu_ts,
                                                                  gpu_ts, gpu_clock_id);
   });
}

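/* Emits a VulkanApiEvent.vk_queue_submit packet; its submission_id is
 * what lets trace processing correlate the render-stage events emitted
 * from stage_end() with the application's queue submit.
 */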
static void
emit_submit_id(uint32_t submission_id)
{
   TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      submit->set_submission_id(submission_id);
   });
}

struct tu_perfetto_clocks
tu_perfetto_submit(struct tu_device *dev,
                   uint32_t submission_id,
                   struct tu_perfetto_clocks *gpu_clocks)
{
   struct tu_perfetto_clocks clocks {};
   if (gpu_clocks) {
      clocks = *gpu_clocks;
   }

   if (!u_trace_perfetto_active(tu_device_get_u_trace(dev)))
      return {};

   clocks.cpu = perfetto::base::GetBootTimeNs().count();

   if (gpu_clocks) {
      /* TODO: It would be better to use the CPU time that comes together
       * with the GPU time from KGSL, but it's not equal to GetBootTimeNs.
       */

      clocks.gpu_ts_offset = MAX2(gpu_timestamp_offset, clocks.gpu_ts_offset);
      gpu_timestamp_offset = clocks.gpu_ts_offset;
      sync_gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
   } else {
      clocks.gpu_ts = 0;
      clocks.gpu_ts_offset = gpu_timestamp_offset;

      if (clocks.cpu < next_clock_sync_ns)
         return clocks;

      if (tu_device_get_gpu_timestamp(dev, &clocks.gpu_ts)) {
         PERFETTO_ELOG("Could not sync CPU and GPU clocks");
         return {};
      }

      clocks.gpu_ts = tu_device_ticks_to_ns(dev, clocks.gpu_ts);

      /* Get the CPU timestamp again, because tu_device_get_gpu_timestamp
       * can take >100us.
       */
      clocks.cpu = perfetto::base::GetBootTimeNs().count();

      uint64_t current_suspend_count = 0;
      /* If we fail to get it, we will use a fallback below. */
      tu_device_get_suspend_count(dev, &current_suspend_count);

      /* The GPU timestamp is reset after a suspend-resume cycle.
       * Perfetto requires clock snapshots to be monotonic, so we have to
       * fix up the time.
       */
      if (current_suspend_count != last_suspend_count) {
         gpu_timestamp_offset = gpu_max_timestamp;
         last_suspend_count = current_suspend_count;
      }
      clocks.gpu_ts_offset = gpu_timestamp_offset;

      uint64_t gpu_absolute_ts = clocks.gpu_ts + clocks.gpu_ts_offset;

      /* Fallback check, to detect non-monotonic cases which would happen
       * if we cannot retrieve the suspend count.
       */
      if (sync_gpu_ts > gpu_absolute_ts) {
         gpu_absolute_ts += (gpu_max_timestamp - gpu_timestamp_offset);
         gpu_timestamp_offset = gpu_max_timestamp;
         clocks.gpu_ts = gpu_absolute_ts - gpu_timestamp_offset;
      }

      if (sync_gpu_ts > gpu_absolute_ts) {
         PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out");
         return {};
      }

      gpu_max_timestamp = clocks.gpu_ts;
      sync_gpu_ts = clocks.gpu_ts;
      next_clock_sync_ns = clocks.cpu + 30000000; /* re-sync every 30ms */
   }

   emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset);
   emit_submit_id(submission_id);
   return clocks;
}

/*
 * Trace callbacks, called from u_trace once the timestamps from the GPU
 * have been collected.
 *
 * The default "extra" funcs are code-generated into tu_tracepoints_perfetto.h
 * and just take the tracepoint's args and add them as name/value pairs in the
 * perfetto events. This file can usually just map a tu_perfetto_* entrypoint
 * to stage_start/end with a call to that code-generated "extra" func. But you
 * can also provide your own entrypoint and extra funcs if you want to change
 * that mapping.
 */
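
/* For illustration, a sketch of what one of those code-generated "extra"
 * funcs looks like (hypothetical "foo" tracepoint with a single uint32_t
 * "count" arg; the real ones live in tu_tracepoints_perfetto.h):
 *
 *    static void
 *    trace_payload_as_extra_start_foo(
 *       perfetto::protos::pbzero::GpuRenderStageEvent *event,
 *       const struct trace_start_foo *payload)
 *    {
 *       char buf[128];
 *
 *       auto data = event->add_extra_data();
 *       data->set_name("count");
 *       snprintf(buf, sizeof(buf), "%u", payload->count);
 *       data->set_value(buf);
 *    }
 */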

#define CREATE_EVENT_CALLBACK(event_name, stage_id)                           \
   void tu_perfetto_start_##event_name(                                       \
      struct tu_device *dev, uint64_t ts_ns, uint16_t tp_idx,                 \
      const void *flush_data, const struct trace_start_##event_name *payload) \
   {                                                                          \
      stage_start(                                                            \
         dev, ts_ns, stage_id, NULL, payload, sizeof(*payload),               \
         (trace_payload_as_extra_func)                                        \
            &trace_payload_as_extra_start_##event_name);                      \
   }                                                                          \
                                                                              \
   void tu_perfetto_end_##event_name(                                         \
      struct tu_device *dev, uint64_t ts_ns, uint16_t tp_idx,                 \
      const void *flush_data, const struct trace_end_##event_name *payload)   \
   {                                                                          \
      stage_end(                                                              \
         dev, ts_ns, stage_id, flush_data, payload,                           \
         (trace_payload_as_extra_func)                                        \
            &trace_payload_as_extra_end_##event_name);                        \
   }

CREATE_EVENT_CALLBACK(cmd_buffer, CMD_BUFFER_STAGE_ID)
CREATE_EVENT_CALLBACK(render_pass, RENDER_PASS_STAGE_ID)
CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)
CREATE_EVENT_CALLBACK(compute, COMPUTE_STAGE_ID)
CREATE_EVENT_CALLBACK(gmem_clear, CLEAR_GMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(sysmem_clear, CLEAR_SYSMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(sysmem_clear_all, CLEAR_SYSMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(gmem_load, GMEM_LOAD_STAGE_ID)
CREATE_EVENT_CALLBACK(gmem_store, GMEM_STORE_STAGE_ID)
CREATE_EVENT_CALLBACK(sysmem_resolve, SYSMEM_RESOLVE_STAGE_ID)

void
tu_perfetto_start_cmd_buffer_annotation(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_cmd_buffer_annotation *payload)
{
   /* No extra func necessary: the only arg is the string, which is passed
    * as the app_event.
    */
   stage_start(dev, ts_ns, CMD_BUFFER_ANNOTATION_STAGE_ID, payload->str,
               payload, sizeof(*payload), NULL);
}

void
tu_perfetto_end_cmd_buffer_annotation(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_cmd_buffer_annotation *payload)
{
   /* The payload string was passed as the app_event at stage_start, so it
    * appears right on the event block rather than as metadata inside.
    */
   stage_end(dev, ts_ns, CMD_BUFFER_ANNOTATION_STAGE_ID, flush_data,
             payload, NULL);
}

void
tu_perfetto_start_cmd_buffer_annotation_rp(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_cmd_buffer_annotation_rp *payload)
{
   /* No extra func necessary: the only arg is the string, which is passed
    * as the app_event.
    */
   stage_start(dev, ts_ns, CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
               payload->str, payload, sizeof(*payload), NULL);
}

void
tu_perfetto_end_cmd_buffer_annotation_rp(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_cmd_buffer_annotation_rp *payload)
{
   /* The payload string was passed as the app_event at stage_start, so it
    * appears right on the event block rather than as metadata inside.
    */
   stage_end(dev, ts_ns, CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
             flush_data, payload, NULL);
}

#ifdef __cplusplus
}
#endif