• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 #include <stdio.h>
7 #include <stdarg.h>
8 #include <string.h>
9 
10 #include "util/hash_table.h"
11 #include "util/u_process.h"
12 #include "util/hash_table.h"
13 
14 #include "si_pipe.h"
15 #include "si_perfetto.h"
16 #include "si_tracepoints.h"
17 
18 #ifdef HAVE_PERFETTO
19 
20 #include "util/perf/u_perfetto.h"
21 #include "util/perf/u_perfetto_renderpass.h"
22 
23 #include "si_tracepoints_perfetto.h"
24 
25 /* Just naming stages */
/* Just naming stages.
 *
 * Human-readable names for each si_ds_queue_stage value, used when interning
 * GPU specification descriptors in send_descriptors().
 */
static const struct {
   const char *name;

   /* The perfetto UI requires that there is a parent-child relationship
    * within a row of elements. Which means that all children elements must
    * end within the lifespan of their parent.
    *
    * Some elements like stalls and command buffers follow that relationship,
    * but not all. This tells us in which UI row the elements should live.
    */
   enum si_ds_queue_stage draw_stage;
} si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
   /* Order must match the enum! */
   {
      "queue",
      SI_DS_QUEUE_STAGE_QUEUE,
   },
   {
      "compute",
      SI_DS_QUEUE_STAGE_COMPUTE,
   },
   {
      "draw",
      SI_DS_QUEUE_STAGE_DRAW,
   }
};
52 
/* Per-sequence incremental state tracked by perfetto.  Starts out (and is
 * reset to) "cleared"; the trace lambdas check this flag and re-send the
 * interned descriptors via send_descriptors() before emitting events.
 */
struct SIRenderpassIncrementalState {
   bool was_cleared = true;
};
56 
/* Data source traits hooking our incremental state type into perfetto. */
struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = SIRenderpassIncrementalState;
};
60 
/* The radeonsi render-stage perfetto data source (registered in
 * si_driver_ds_init_once() as "gpu.renderstages.amd").  All behavior comes
 * from the shared MesaRenderpassDataSource base.
 */
class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource,
                                                               SIRenderpassTraits> {
};

PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
67 
68 using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;
69 
sync_timestamp(SIRenderpassDataSource::TraceContext & ctx,struct si_ds_device * device)70 static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
71 {
72    uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
73    uint64_t gpu_ts;
74 
75    struct si_context *sctx = container_of(device, struct si_context, ds);
76    gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
77 
78 
79    cpu_ts = perfetto::base::GetBootTimeNs().count();
80 
81    if (cpu_ts < device->next_clock_sync_ns)
82       return;
83 
84    PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);
85 
86    device->sync_gpu_ts = gpu_ts;
87    device->next_clock_sync_ns = cpu_ts + 1000000000ull;
88    MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::
89       EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
90 }
91 
/* Send one trace packet carrying all interned descriptors (graphics context
 * plus per-queue / per-stage GPU specifications) and flag it as the start of
 * a fresh incremental state.  Called whenever perfetto reports that the
 * incremental state was cleared.
 */
static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx,
                             struct si_ds_device *device)
{
   PERFETTO_LOG("Sending renderstage descriptors");

   /* Restart event ids and drop any in-flight begin timestamps: they belong
    * to the previous interning generation. */
   device->event_id = 0;
   list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
         queue->stages[s].start_ns[0] = 0;
      }
   }

   {
      auto packet = ctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      /* Tell the consumer all previously interned data is invalid. */
      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

      auto interned_data = packet->set_interned_data();

      {
         /* Describe this device/context (pid + API) once per generation. */
         auto desc = interned_data->add_graphics_contexts();
         desc->set_iid(device->iid);
         desc->set_pid(getpid());
         switch (device->api) {
         case AMD_DS_API_OPENGL:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
            break;
         case AMD_DS_API_VULKAN:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
            break;
         default:
            break;
         }
      }

      /* Emit all the IID picked at device/queue creation. */
      list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
         for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
            {
               /* We put the stage number in there so that all rows are order
                * by si_ds_queue_stage.
                */
               char name[100];
               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(),
                        queue->name, s, si_queue_stage_desc[s].name);

               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].queue_iid);
               desc->set_name(name);
            }
            {
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].stage_iid);
               desc->set_name(si_queue_stage_desc[s].name);
            }
         }
      }
   }

   /* Force an immediate clock sync so the new generation has a calibration
    * point right away. */
   device->next_clock_sync_ns = 0;
   sync_timestamp(ctx, device);
}
156 
/* Callback that appends a tracepoint payload's fields as extra_data on a
 * GpuRenderStageEvent (implementations generated in si_tracepoints_perfetto.h). */
typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *,
                                            const void*);
159 
begin_event(struct si_ds_queue * queue,uint64_t ts_ns,enum si_ds_queue_stage stage_id)160 static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
161 {
162    PERFETTO_LOG("begin event called - ts_ns=%" PRIu64, ts_ns);
163    uint32_t level = queue->stages[stage_id].level;
164    /* If we haven't managed to calibrate the alignment between GPU and CPU
165     * timestamps yet, then skip this trace, otherwise perfetto won't know
166     * what to do with it.
167     */
168    if (!queue->device->sync_gpu_ts) {
169       queue->stages[stage_id].start_ns[level] = 0;
170       return;
171    }
172 
173    if (level >= (ARRAY_SIZE(queue->stages[stage_id].start_ns) - 1))
174       return;
175 
176    queue->stages[stage_id].start_ns[level] = ts_ns;
177    queue->stages[stage_id].level++;
178 }
179 
/* Pop the matching begin timestamp for @stage_id and emit a perfetto
 * GpuRenderStageEvent spanning [start_ns, ts_ns].
 *
 * @queue            queue the event was recorded on
 * @ts_ns            timestamp (in the device's GPU clock domain) of the end
 * @stage_id         which UI row / stage the event belongs to
 * @submission_id    submission this event was part of
 * @app_event        if non-NULL, an application debug-marker label; a stage
 *                   iid is interned for it on demand instead of the driver's
 * @payload          optional tracepoint payload for @payload_as_extra
 * @payload_as_extra optional callback emitting payload fields as extra_data
 */
static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id,
                      uint32_t submission_id, const char *app_event, const void* payload = nullptr,
                      trace_payload_as_extra_func payload_as_extra = nullptr)
{
   PERFETTO_LOG("end event called - ts_ns=%" PRIu64, ts_ns);
   struct si_ds_device *device = queue->device;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!device->sync_gpu_ts)
      return;

   /* No matching begin_event() was recorded. */
   if (queue->stages[stage_id].level == 0)
      return;

   uint32_t level = --queue->stages[stage_id].level;
   struct si_ds_stage *stage = &queue->stages[stage_id];
   uint64_t start_ns = stage->start_ns[level];
   PERFETTO_LOG("end event called - start_ns=%" PRIu64 " ts_ns=%" PRIu64, start_ns, ts_ns);
   /* Drop events whose begin was skipped or whose timestamps are inverted. */
   if (!start_ns || start_ns > ts_ns)
      return;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* First packet after an incremental-state clear must re-send the
       * interned descriptors. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      uint64_t evt_id = device->event_id++;

      /* If this is an application event, we might need to generate a new
       * stage_iid if not already seen. Otherwise, it's a driver event and we
       * have use the internal stage_iid.
       */
      uint64_t stage_iid = app_event ?
                           tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) :
                           stage->stage_iid;

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ns);
      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);

      assert(ts_ns >= start_ns);

      auto event = packet->set_gpu_render_stage_event();
      event->set_gpu_id(queue->device->gpu_id);

      event->set_hw_queue_iid(stage->queue_iid);
      event->set_stage_iid(stage_iid);
      event->set_context(queue->device->iid);
      event->set_event_id(evt_id);
      event->set_duration(ts_ns - start_ns);
      event->set_submission_id(submission_id);

      if (payload && payload_as_extra) {
         payload_as_extra(event, payload);
      }
   });

   /* Mark the slot consumed so a stale value can't be reused. */
   stage->start_ns[level] = 0;
}
246 
247 #endif /* HAVE_PERFETTO */
248 
249 #ifdef __cplusplus
250 extern "C" {
251 #endif
252 
253 #ifdef HAVE_PERFETTO
254 
255 /*
256  * Trace callbacks, called from u_trace once the timestamps from GPU have been
257  * collected.
258  */
259 
/* Generate the begin/end u_trace callback pair for one tracepoint.  The
 * begin callback pushes the GPU timestamp; the end callback pops it and
 * emits the render-stage event, forwarding the end payload as extra data.
 */
#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage)                                             \
void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx,       \
                              const void *flush_data,                                             \
                              const struct trace_si_begin_##event_name *payload)                  \
{                                                                                                 \
   const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data;           \
   begin_event(flush->queue, ts_ns, stage);                                                       \
}                                                                                                 \
                                                                                                  \
void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx,         \
                            const void *flush_data,                                               \
                            const struct trace_si_end_##event_name *payload)                      \
{                                                                                                 \
   const struct si_ds_flush_data *flush =  (const struct si_ds_flush_data *) flush_data;          \
   end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload,                     \
             (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name);           \
}                                                                                                 \

CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)
280 
281 uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
282 {
283    return perfetto::base::GetBootTimeNs().count();
284 }
285 
/* Record the end of a queue submission: emits a vk_queue_submit API event
 * spanning [start_ts, now] on the boot clock.
 *
 * @start_ts value previously returned by si_ds_begin_submit().
 */
void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
{
   /* Tracing inactive: drop the clock calibration so it is re-established
    * when tracing resumes. */
   if (!u_trace_should_process(&queue->device->trace_context)) {
      queue->device->sync_gpu_ts = 0;
      queue->device->next_clock_sync_ns = 0;
      return;
   }

   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
   uint32_t submission_id = queue->submission_id++;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-send interned descriptors after an incremental-state clear. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      packet->set_timestamp(start_ts);

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      submit->set_duration_ns(end_ts - start_ts);
      submit->set_vk_queue((uintptr_t) queue);
      submit->set_submission_id(submission_id);
   });
}
317 
318 #endif /* HAVE_PERFETTO */
319 
/* One-time process-wide setup: initialize the perfetto glue and register
 * the render-stage data source under the name tools look for on AMD GPUs.
 * Invoked exactly once through si_driver_ds_once_flag / call_once().
 */
static void si_driver_ds_init_once(void)
{
#ifdef HAVE_PERFETTO
   util_perfetto_init();
   perfetto::DataSourceDescriptor dsd;
   dsd.set_name("gpu.renderstages.amd");
   SIRenderpassDataSource::Register(dsd);
#endif
}
329 
/* Guards the one-time data source registration above. */
static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;
/* Next interning id to hand out; starts at 1.
 * NOTE(review): not atomic — presumably callers are serialized; confirm. */
static uint64_t iid = 1;

/* Return a process-unique interning id for perfetto interned data. */
static uint64_t get_iid()
{
   return iid++;
}
337 
si_pps_clock_id(uint32_t gpu_id)338 static uint32_t si_pps_clock_id(uint32_t gpu_id)
339 {
340    char buf[40];
341    snprintf(buf, sizeof(buf), "org.freedesktop.mesa.amd.gpu%u", gpu_id);
342 
343    return _mesa_hash_string(buf) | 0x80000000;
344 }
345 
/* Public driver entry point: perform one-time perfetto registration and
 * refresh the tracepoint configuration from the environment. */
void si_driver_ds_init(void)
{
   call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
   si_gpu_tracepoint_config_variable();
}
351 
si_ds_device_init(struct si_ds_device * device,const struct radeon_info * devinfo,uint32_t gpu_id,enum amd_ds_api api)352 void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
353                        uint32_t gpu_id, enum amd_ds_api api)
354 {
355    device->gpu_id = gpu_id;
356    device->gpu_clock_id = si_pps_clock_id(gpu_id);
357    device->info = devinfo;
358    device->iid = get_iid();
359    device->api = api;
360    list_inithead(&device->queues);
361 }
362 
/* Tear down per-device tracing state; finalizes the device's u_trace
 * context (counterpart to si_ds_device_init()). */
void si_ds_device_fini(struct si_ds_device *device)
{
   u_trace_context_fini(&device->trace_context);
}
367 
si_ds_device_init_queue(struct si_ds_device * device,struct si_ds_queue * queue,const char * fmt_name,...)368 struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device,
369                                              struct si_ds_queue *queue,
370                                              const char *fmt_name, ...)
371 {
372    va_list ap;
373    queue->device = device;
374 
375    va_start(ap, fmt_name);
376    vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
377    va_end(ap);
378 
379    for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
380       queue->stages[s].queue_iid = get_iid();
381       queue->stages[s].stage_iid = get_iid();
382    }
383 
384    list_add(&queue->link, &device->queues);
385 
386    return queue;
387 }
388 
si_ds_flush_data_init(struct si_ds_flush_data * data,struct si_ds_queue * queue,uint64_t submission_id)389 void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
390                            uint64_t submission_id)
391 {
392    memset(data, 0, sizeof(*data));
393 
394    data->queue = queue;
395    data->submission_id = submission_id;
396 
397    u_trace_init(&data->trace, &queue->device->trace_context);
398 }
399 
/* Release the u_trace resources owned by @data; counterpart to
 * si_ds_flush_data_init(). */
void si_ds_flush_data_fini(struct si_ds_flush_data *data)
{
   u_trace_fini(&data->trace);
}
404 
405 #ifdef __cplusplus
406 }
407 #endif
408