1 /*
2  * Copyright © 2021 Google, Inc.
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include <perfetto.h>
7 
8 #include "tu_perfetto.h"
9 #include "tu_device.h"
10 
11 #include "util/hash_table.h"
12 #include "util/perf/u_perfetto.h"
13 #include "util/perf/u_perfetto_renderpass.h"
14 
15 #include "tu_tracepoints.h"
16 #include "tu_tracepoints_perfetto.h"
17 
/* We can't include tu_knl.h here, so declare the few device entrypoints we
 * need directly.
 */
19 
20 int
21 tu_device_get_gpu_timestamp(struct tu_device *dev,
22                             uint64_t *ts);
23 int
24 tu_device_get_suspend_count(struct tu_device *dev,
25                             uint64_t *suspend_count);
26 uint64_t
27 tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
28 
29 struct u_trace_context *
30 tu_device_get_u_trace(struct tu_device *device);
31 
32 /**
33  * Queue-id's
34  */
enum {
   DEFAULT_HW_QUEUE_ID,   /* only HW queue exposed; see queues[] below */
};
38 
39 /**
40  * Render-stage id's
41  */
/* Stage ids double as indices into the stages[] descriptor table below. */
enum tu_stage_id {
   CMD_BUFFER_STAGE_ID,
   CMD_BUFFER_ANNOTATION_STAGE_ID,
   RENDER_PASS_STAGE_ID,
   CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
   BINNING_STAGE_ID,        /* visibility pass */
   GMEM_STAGE_ID,           /* tiled (GMEM) rendering */
   BYPASS_STAGE_ID,         /* direct sysmem rendering */
   BLIT_STAGE_ID,
   COMPUTE_STAGE_ID,
   CLEAR_SYSMEM_STAGE_ID,
   CLEAR_GMEM_STAGE_ID,
   GMEM_LOAD_STAGE_ID,
   GMEM_STORE_STAGE_ID,
   SYSMEM_RESOLVE_STAGE_ID,
   // TODO add the rest from fd_stage_id
};
59 
/* Name/description for each HW queue id, interned into the trace by
 * send_descriptors().
 */
static const struct {
   const char *name;
   const char *desc;
} queues[] = {
   [DEFAULT_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
};
66 
/* Name/description for each tu_stage_id, interned into the trace by
 * send_descriptors().  desc may be NULL for entries initialized with only a
 * name; send_descriptors() skips the description in that case.
 */
static const struct {
   const char *name;
   const char *desc;
} stages[] = {
   [CMD_BUFFER_STAGE_ID]     = { "Command Buffer" },
   [CMD_BUFFER_ANNOTATION_STAGE_ID]     = { "Annotation", "Command Buffer Annotation" },
   [RENDER_PASS_STAGE_ID]    = { "Render Pass" },
   [CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID]    = { "Annotation", "Render Pass Command Buffer Annotation" },
   [BINNING_STAGE_ID]        = { "Binning", "Perform Visibility pass and determine target bins" },
   [GMEM_STAGE_ID]           = { "GMEM", "Rendering to GMEM" },
   [BYPASS_STAGE_ID]         = { "Bypass", "Rendering to system memory" },
   [BLIT_STAGE_ID]           = { "Blit", "Performing a Blit operation" },
   [COMPUTE_STAGE_ID]        = { "Compute", "Compute job" },
   [CLEAR_SYSMEM_STAGE_ID]   = { "Clear Sysmem", "" },
   [CLEAR_GMEM_STAGE_ID]     = { "Clear GMEM", "Per-tile (GMEM) clear" },
   [GMEM_LOAD_STAGE_ID]      = { "GMEM Load", "Per tile system memory to GMEM load" },
   [GMEM_STORE_STAGE_ID]     = { "GMEM Store", "Per tile GMEM to system memory store" },
   [SYSMEM_RESOLVE_STAGE_ID] = { "SysMem Resolve", "System memory MSAA resolve" },
   // TODO add the rest
};
87 
/* Perfetto custom clock id for the GPU timeline (derived in OnStart()). */
static uint32_t gpu_clock_id;
static uint64_t next_clock_sync_ns; /* cpu time of next clk sync */

/**
 * The timestamp at the point where we first emitted the clock_sync..
 * this  will be a *later* timestamp that the first GPU traces (since
 * we capture the first clock_sync from the CPU *after* the first GPU
 * tracepoints happen).  To avoid confusing perfetto we need to drop
 * the GPU traces with timestamps before this.
 */
static uint64_t sync_gpu_ts;

/* Suspend count at the last clock sync; a change means the GPU clock was
 * reset and gpu_timestamp_offset must be advanced (see tu_perfetto_submit()).
 */
static uint64_t last_suspend_count;

/* Highest GPU timestamp emitted so far (see stage_end()/tu_perfetto_submit()). */
static uint64_t gpu_max_timestamp;
/* Offset added to raw GPU timestamps to keep them monotonic across
 * suspend/resume cycles.
 */
static uint64_t gpu_timestamp_offset;
104 
/* Per-sequence incremental state: tracks whether the interned render-stage
 * descriptors must be (re)sent after perfetto cleared the state.
 */
struct TuRenderpassIncrementalState {
   bool was_cleared = true;
};
108 
/* DataSource traits wiring our incremental-state type into perfetto. */
struct TuRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = TuRenderpassIncrementalState;
};
112 
class TuRenderpassDataSource : public MesaRenderpassDataSource<TuRenderpassDataSource,
                                                               TuRenderpassTraits> {
   /* Called by perfetto when a tracing session starts: derives the custom GPU
    * clock id and resets the calibration globals so a new session does not
    * inherit offsets from a previous one.
    */
   void OnStart(const StartArgs &args) override
   {
      MesaRenderpassDataSource<TuRenderpassDataSource, TuRenderpassTraits>::OnStart(args);

      /* Note: clock_id's below 128 are reserved.. for custom clock sources,
       * using the hash of a namespaced string is the recommended approach.
       * See: https://perfetto.dev/docs/concepts/clock-sync
       */
      gpu_clock_id =
         _mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000;

      gpu_timestamp_offset = 0;
      gpu_max_timestamp = 0;
      last_suspend_count = 0;
   }
};
131 
132 PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
133 PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
134 
135 static void
send_descriptors(TuRenderpassDataSource::TraceContext & ctx)136 send_descriptors(TuRenderpassDataSource::TraceContext &ctx)
137 {
138    PERFETTO_LOG("Sending renderstage descriptors");
139 
140    auto packet = ctx.NewTracePacket();
141 
142    /* This must be set before interned data is sent. */
143    packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);
144 
145    packet->set_timestamp(0);
146 
147    auto event = packet->set_gpu_render_stage_event();
148    event->set_gpu_id(0);
149 
150    auto spec = event->set_specifications();
151 
152    for (unsigned i = 0; i < ARRAY_SIZE(queues); i++) {
153       auto desc = spec->add_hw_queue();
154 
155       desc->set_name(queues[i].name);
156       desc->set_description(queues[i].desc);
157    }
158 
159    for (unsigned i = 0; i < ARRAY_SIZE(stages); i++) {
160       auto desc = spec->add_stage();
161 
162       desc->set_name(stages[i].name);
163       if (stages[i].desc)
164          desc->set_description(stages[i].desc);
165    }
166 }
167 
168 static struct tu_perfetto_stage *
stage_push(struct tu_device * dev)169 stage_push(struct tu_device *dev)
170 {
171    struct tu_perfetto_state *p = &dev->perfetto;
172 
173    if (p->stage_depth >= ARRAY_SIZE(p->stages)) {
174       p->skipped_depth++;
175       return NULL;
176    }
177 
178    return &p->stages[p->stage_depth++];
179 }
180 
181 typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);
182 
183 static struct tu_perfetto_stage *
stage_pop(struct tu_device * dev)184 stage_pop(struct tu_device *dev)
185 {
186    struct tu_perfetto_state *p = &dev->perfetto;
187 
188    if (!p->stage_depth)
189       return NULL;
190 
191    if (p->skipped_depth) {
192       p->skipped_depth--;
193       return NULL;
194    }
195 
196    return &p->stages[--p->stage_depth];
197 }
198 
/*
 * Record the beginning of a stage: push a stage entry, snapshot the start
 * timestamp, and keep a private copy of the tracepoint payload so it can be
 * emitted as "extra" data when the matching stage_end() arrives.
 */
static void
stage_start(struct tu_device *dev,
            uint64_t ts_ns,
            enum tu_stage_id stage_id,
            const char *app_event,
            const void *payload = nullptr,
            size_t payload_size = 0,
            trace_payload_as_extra_func payload_as_extra = nullptr)
{
   struct tu_perfetto_stage *stage = stage_push(dev);

   if (!stage) {
      PERFETTO_ELOG("stage %d is nested too deep", stage_id);
      return;
   }

   if (payload) {
      /* Copy the payload rather than hold the caller's pointer; the copy is
       * freed in stage_end().  On allocation failure the stage proceeds
       * without payload data.
       */
      void* new_payload = malloc(payload_size);
      if (new_payload)
         memcpy(new_payload, payload, payload_size);
      else
         PERFETTO_ELOG("Failed to allocate payload for stage %d", stage_id);
      payload = new_payload;
   }

   *stage = (struct tu_perfetto_stage) {
      .stage_id = stage_id,
      .stage_iid = 0,
      .start_ts = ts_ns,
      .payload = payload,
      .start_payload_function = (void *) payload_as_extra,
   };

   if (app_event) {
      /* Intern the app-provided event name; stage_iid stays 0 when no
       * tracing session is active, in which case stage_end() falls back to
       * the static stage_id.
       */
      TuRenderpassDataSource::Trace([=](auto tctx) {
         stage->stage_iid =
            tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event);
      });
   }
}
239 
240 static void
stage_end(struct tu_device * dev,uint64_t ts_ns,enum tu_stage_id stage_id,const void * flush_data,const void * payload=nullptr,trace_payload_as_extra_func payload_as_extra=nullptr)241 stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
242           const void *flush_data,
243           const void* payload = nullptr,
244           trace_payload_as_extra_func payload_as_extra = nullptr)
245 {
246    struct tu_perfetto_stage *stage = stage_pop(dev);
247    auto trace_flush_data =
248       (const struct tu_u_trace_submission_data *) flush_data;
249    uint32_t submission_id = trace_flush_data->submission_id;
250    uint64_t gpu_ts_offset = trace_flush_data->gpu_ts_offset;
251 
252    if (!stage)
253       return;
254 
255    if (stage->stage_id != stage_id) {
256       PERFETTO_ELOG("stage %d ended while stage %d is expected",
257             stage_id, stage->stage_id);
258       return;
259    }
260 
261    /* If we haven't managed to calibrate the alignment between GPU and CPU
262     * timestamps yet, then skip this trace, otherwise perfetto won't know
263     * what to do with it.
264     */
265    if (!sync_gpu_ts)
266       return;
267 
268    TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
269       if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
270          send_descriptors(tctx);
271          state->was_cleared = false;
272       }
273 
274       auto packet = tctx.NewTracePacket();
275 
276       gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_ts_offset);
277 
278       packet->set_timestamp(stage->start_ts + gpu_ts_offset);
279       packet->set_timestamp_clock_id(gpu_clock_id);
280 
281       auto event = packet->set_gpu_render_stage_event();
282       event->set_event_id(0); // ???
283       event->set_hw_queue_id(DEFAULT_HW_QUEUE_ID);
284       event->set_duration(ts_ns - stage->start_ts);
285       if (stage->stage_iid)
286          event->set_stage_iid(stage->stage_iid);
287       else
288          event->set_stage_id(stage->stage_id);
289       event->set_context((uintptr_t) dev);
290       event->set_submission_id(submission_id);
291 
292       if (stage->payload) {
293          if (stage->start_payload_function)
294             ((trace_payload_as_extra_func) stage->start_payload_function)(
295                event, stage->payload);
296          free((void *)stage->payload);
297       }
298 
299       if (payload && payload_as_extra)
300          payload_as_extra(event, payload);
301    });
302 }
303 
304 #ifdef __cplusplus
305 extern "C" {
306 #endif
307 
308 void
tu_perfetto_init(void)309 tu_perfetto_init(void)
310 {
311    util_perfetto_init();
312 
313    perfetto::DataSourceDescriptor dsd;
314 #if DETECT_OS_ANDROID
315    /* AGI requires this name */
316    dsd.set_name("gpu.renderstages");
317 #else
318    dsd.set_name("gpu.renderstages.msm");
319 #endif
320    TuRenderpassDataSource::Register(dsd);
321 }
322 
323 static void
emit_sync_timestamp(uint64_t cpu_ts,uint64_t gpu_ts)324 emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts)
325 {
326    TuRenderpassDataSource::Trace([=](auto tctx) {
327       MesaRenderpassDataSource<TuRenderpassDataSource,
328                                TuRenderpassTraits>::EmitClockSync(tctx, cpu_ts,
329                                                                   gpu_ts, gpu_clock_id);
330    });
331 }
332 
333 static void
emit_submit_id(uint32_t submission_id)334 emit_submit_id(uint32_t submission_id)
335 {
336    TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
337       auto packet = tctx.NewTracePacket();
338 
339       packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
340 
341       auto event = packet->set_vulkan_api_event();
342       auto submit = event->set_vk_queue_submit();
343 
344       submit->set_submission_id(submission_id);
345    });
346 }
347 
/**
 * Called per queue submission: emits a CPU/GPU clock sync plus a
 * vk_queue_submit event, and maintains the offset that keeps GPU timestamps
 * monotonic across suspend/resume.
 *
 * If gpu_clocks is non-NULL the caller already sampled the GPU clock;
 * otherwise the GPU timestamp is queried here, rate-limited to one sync per
 * 30ms.  Returns the clocks used, or a zero-initialized struct when tracing
 * is inactive or the sync failed.
 */
struct tu_perfetto_clocks
tu_perfetto_submit(struct tu_device *dev,
                   uint32_t submission_id,
                   struct tu_perfetto_clocks *gpu_clocks)
{
   struct tu_perfetto_clocks clocks {};
   if (gpu_clocks) {
      clocks = *gpu_clocks;
   }

   if (!u_trace_perfetto_active(tu_device_get_u_trace(dev)))
      return {};

   clocks.cpu = perfetto::base::GetBootTimeNs().count();

   if (gpu_clocks) {
      /* TODO: It would be better to use CPU time that comes
       * together with GPU time from the KGSL, but it's not
       * equal to GetBootTimeNs.
       */

      clocks.gpu_ts_offset = MAX2(gpu_timestamp_offset, clocks.gpu_ts_offset);
      gpu_timestamp_offset = clocks.gpu_ts_offset;
      sync_gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
   } else {
      clocks.gpu_ts = 0;
      clocks.gpu_ts_offset = gpu_timestamp_offset;

      /* Rate-limit the syncs: reading the GPU counter is expensive. */
      if (clocks.cpu < next_clock_sync_ns)
         return clocks;

      if (tu_device_get_gpu_timestamp(dev, &clocks.gpu_ts)) {
         PERFETTO_ELOG("Could not sync CPU and GPU clocks");
         return {};
      }

      clocks.gpu_ts = tu_device_ticks_to_ns(dev, clocks.gpu_ts);

      /* get cpu timestamp again because tu_device_get_gpu_timestamp can take
       * >100us
       */
      clocks.cpu = perfetto::base::GetBootTimeNs().count();

      uint64_t current_suspend_count = 0;
      /* If we fail to get it we will use a fallback */
      tu_device_get_suspend_count(dev, &current_suspend_count);

      /* GPU timestamp is being reset after suspend-resume cycle.
       * Perfetto requires clock snapshots to be monotonic,
       * so we have to fix-up the time.
       */
      if (current_suspend_count != last_suspend_count) {
         gpu_timestamp_offset = gpu_max_timestamp;
         last_suspend_count = current_suspend_count;
      }
      clocks.gpu_ts_offset = gpu_timestamp_offset;

      uint64_t gpu_absolute_ts = clocks.gpu_ts + clocks.gpu_ts_offset;

      /* Fallback check, detect non-monotonic cases which would happen
       * if we cannot retrieve suspend count.
       */
      if (sync_gpu_ts > gpu_absolute_ts) {
         gpu_absolute_ts += (gpu_max_timestamp - gpu_timestamp_offset);
         gpu_timestamp_offset = gpu_max_timestamp;
         clocks.gpu_ts = gpu_absolute_ts - gpu_timestamp_offset;
      }

      if (sync_gpu_ts > gpu_absolute_ts) {
         PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out");
         return {};
      }

      /* NOTE(review): these are updated with the un-offset gpu_ts, while
       * stage_end() folds the offset into gpu_max_timestamp — confirm the
       * mixed bases are intentional.
       */
      gpu_max_timestamp = clocks.gpu_ts;
      sync_gpu_ts = clocks.gpu_ts;
      next_clock_sync_ns = clocks.cpu + 30000000;
   }

   emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset);
   emit_submit_id(submission_id);
   return clocks;
}
430 
431 /*
432  * Trace callbacks, called from u_trace once the timestamps from GPU have been
433  * collected.
434  *
435  * The default "extra" funcs are code-generated into tu_tracepoints_perfetto.h
436  * and just take the tracepoint's args and add them as name/value pairs in the
437  * perfetto events.  This file can usually just map a tu_perfetto_* to
438  * stage_start/end with a call to that codegenned "extra" func.  But you can
439  * also provide your own entrypoint and extra funcs if you want to change that
440  * mapping.
441  */
442 
/* Generates the tu_perfetto_start_/end_<event_name> callbacks, forwarding the
 * tracepoint payload to stage_start()/stage_end() along with the
 * code-generated trace_payload_as_extra_* helper for that event.
 */
#define CREATE_EVENT_CALLBACK(event_name, stage_id)                                 \
   void tu_perfetto_start_##event_name(                                             \
      struct tu_device *dev, uint64_t ts_ns, uint16_t tp_idx,                       \
      const void *flush_data, const struct trace_start_##event_name *payload)       \
   {                                                                                \
      stage_start(                                                                  \
         dev, ts_ns, stage_id, NULL, payload, sizeof(*payload),                     \
         (trace_payload_as_extra_func) &trace_payload_as_extra_start_##event_name); \
   }                                                                                \
                                                                                    \
   void tu_perfetto_end_##event_name(                                               \
      struct tu_device *dev, uint64_t ts_ns, uint16_t tp_idx,                       \
      const void *flush_data, const struct trace_end_##event_name *payload)         \
   {                                                                                \
      stage_end(                                                                    \
         dev, ts_ns, stage_id, flush_data, payload,                                 \
         (trace_payload_as_extra_func) &trace_payload_as_extra_end_##event_name);   \
   }
461 
/* Instantiate the start/end callbacks for each code-generated tracepoint. */
CREATE_EVENT_CALLBACK(cmd_buffer, CMD_BUFFER_STAGE_ID)
CREATE_EVENT_CALLBACK(render_pass, RENDER_PASS_STAGE_ID)
CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)
CREATE_EVENT_CALLBACK(compute, COMPUTE_STAGE_ID)
CREATE_EVENT_CALLBACK(gmem_clear, CLEAR_GMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(sysmem_clear, CLEAR_SYSMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(sysmem_clear_all, CLEAR_SYSMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(gmem_load, GMEM_LOAD_STAGE_ID)
CREATE_EVENT_CALLBACK(gmem_store, GMEM_STORE_STAGE_ID)
CREATE_EVENT_CALLBACK(sysmem_resolve, SYSMEM_RESOLVE_STAGE_ID)
475 
476 void
477 tu_perfetto_start_cmd_buffer_annotation(
478    struct tu_device *dev,
479    uint64_t ts_ns,
480    uint16_t tp_idx,
481    const void *flush_data,
482    const struct trace_start_cmd_buffer_annotation *payload)
483 {
484    /* No extra func necessary, the only arg is in the end payload.*/
485    stage_start(dev, ts_ns, CMD_BUFFER_ANNOTATION_STAGE_ID, payload->str, payload,
486                sizeof(*payload), NULL);
487 }
488 
489 void
tu_perfetto_end_cmd_buffer_annotation(struct tu_device * dev,uint64_t ts_ns,uint16_t tp_idx,const void * flush_data,const struct trace_end_cmd_buffer_annotation * payload)490 tu_perfetto_end_cmd_buffer_annotation(
491    struct tu_device *dev,
492    uint64_t ts_ns,
493    uint16_t tp_idx,
494    const void *flush_data,
495    const struct trace_end_cmd_buffer_annotation *payload)
496 {
497    /* Pass the payload string as the app_event, which will appear right on the
498     * event block, rather than as metadata inside.
499     */
500    stage_end(dev, ts_ns, CMD_BUFFER_ANNOTATION_STAGE_ID, flush_data,
501              payload, NULL);
502 }
503 
504 void
tu_perfetto_start_cmd_buffer_annotation_rp(struct tu_device * dev,uint64_t ts_ns,uint16_t tp_idx,const void * flush_data,const struct trace_start_cmd_buffer_annotation_rp * payload)505 tu_perfetto_start_cmd_buffer_annotation_rp(
506    struct tu_device *dev,
507    uint64_t ts_ns,
508    uint16_t tp_idx,
509    const void *flush_data,
510    const struct trace_start_cmd_buffer_annotation_rp *payload)
511 {
512    /* No extra func necessary, the only arg is in the end payload.*/
513    stage_start(dev, ts_ns, CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
514                payload->str, payload, sizeof(*payload), NULL);
515 }
516 
517 void
tu_perfetto_end_cmd_buffer_annotation_rp(struct tu_device * dev,uint64_t ts_ns,uint16_t tp_idx,const void * flush_data,const struct trace_end_cmd_buffer_annotation_rp * payload)518 tu_perfetto_end_cmd_buffer_annotation_rp(
519    struct tu_device *dev,
520    uint64_t ts_ns,
521    uint16_t tp_idx,
522    const void *flush_data,
523    const struct trace_end_cmd_buffer_annotation_rp *payload)
524 {
525    /* Pass the payload string as the app_event, which will appear right on the
526     * event block, rather than as metadata inside.
527     */
528    stage_end(dev, ts_ns, CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
529              flush_data, payload, NULL);
530 }
531 
532 #ifdef __cplusplus
533 }
534 #endif
535