• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 #include <stdio.h>
7 #include <stdarg.h>
8 #include <string.h>
9 
10 #include "util/hash_table.h"
11 #include "util/u_process.h"
12 #include "util/hash_table.h"
13 
14 #include "si_pipe.h"
15 #include "si_perfetto.h"
16 #include "si_tracepoints.h"
17 
18 #ifdef HAVE_PERFETTO
19 
20 #include "util/perf/u_perfetto.h"
21 #include "util/perf/u_perfetto_renderpass.h"
22 
23 #include "si_tracepoints_perfetto.h"
24 
25 /* Just naming stages */
/* Just naming stages.
 * Indexed by si_ds_queue_stage, so entry s names stage s.
 */
static const struct {
   const char *name;

   /* The perfetto UI requires that there is a parent-child relationship
    * within a row of elements. Which means that all children elements must
    * end within the lifespan of their parent.
    *
    * Some elements like stalls and command buffers follow that relationship,
    * but not all. This tells us in which UI row the elements should live.
    */
   enum si_ds_queue_stage draw_stage;
} si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
   /* Order must match the enum! */
   {
      "queue",
      SI_DS_QUEUE_STAGE_QUEUE,
   },
   {
      "compute",
      SI_DS_QUEUE_STAGE_COMPUTE,
   },
   {
      "draw",
      SI_DS_QUEUE_STAGE_DRAW,
   }
};
52 
/* Per-trace-sequence incremental state. `was_cleared` starts true (and is
 * reset to true by perfetto when the service drops interned state), signaling
 * that the interned descriptors must be (re-)sent before the next event.
 */
struct SIRenderpassIncrementalState {
   bool was_cleared = true;
};
56 
/* Data source traits: plug our incremental state type into perfetto's
 * default data source traits.
 */
struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
   using IncrementalStateType = SIRenderpassIncrementalState;
};
60 
/* Concrete radeonsi render-stage data source; all behavior comes from the
 * shared MesaRenderpassDataSource base.
 */
class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource,
                                                               SIRenderpassTraits> {
};

/* Perfetto requires these macros to declare/define the data source's static
 * members (registration state, TLS, etc.).
 */
PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
67 
68 using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;
69 
sync_timestamp(SIRenderpassDataSource::TraceContext & ctx,struct si_ds_device * device)70 static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
71 {
72    uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
73    uint64_t gpu_ts;
74 
75    struct si_context *sctx = container_of(device, struct si_context, ds);
76    gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
77 
78 
79    cpu_ts = perfetto::base::GetBootTimeNs().count();
80 
81    if (cpu_ts < device->next_clock_sync_ns)
82       return;
83 
84    PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);
85 
86    device->sync_gpu_ts = gpu_ts;
87    device->next_clock_sync_ns = cpu_ts + 1000000000ull;
88    MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::
89       EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
90 }
91 
/* (Re-)send all interned descriptors for this device: the graphics context
 * and one GPU specification per queue/stage pair. Called whenever the
 * perfetto service has cleared its incremental (interning) state.
 */
static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx,
                             struct si_ds_device *device)
{
   PERFETTO_LOG("Sending renderstage descriptors");

   /* Restart event ids and drop any in-flight begin timestamps: events that
    * straddle a state clear cannot be emitted consistently.
    */
   device->event_id = 0;
   list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
         queue->stages[s].start_ns[0] = 0;
      }
   }

   {
      auto packet = ctx.NewTracePacket();

      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
      /* Mark this packet as the start of a fresh interning generation. */
      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);

      auto interned_data = packet->set_interned_data();

      {
         /* Describe the API context (GL vs. Vulkan) owning these events. */
         auto desc = interned_data->add_graphics_contexts();
         desc->set_iid(device->iid);
         desc->set_pid(getpid());
         switch (device->api) {
         case AMD_DS_API_OPENGL:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
            break;
         case AMD_DS_API_VULKAN:
            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
            break;
         default:
            break;
         }
      }

      /* Emit all the IID picked at device/queue creation. */
      list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
         for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
            {
               /* We put the stage number in there so that all rows are order
                * by si_ds_queue_stage.
                */
               char name[100];
               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(),
                        queue->name, s, si_queue_stage_desc[s].name);

               /* Row (hw queue) descriptor for this queue/stage. */
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].queue_iid);
               desc->set_name(name);
            }
            {
               /* Stage descriptor, shared name across queues. */
               auto desc = interned_data->add_gpu_specifications();
               desc->set_iid(queue->stages[s].stage_iid);
               desc->set_name(si_queue_stage_desc[s].name);
            }
         }
      }
   }

   /* Force an immediate clock re-sync after a state clear. */
   device->next_clock_sync_ns = 0;
   sync_timestamp(ctx, device);
}
156 
begin_event(struct si_ds_queue * queue,uint64_t ts_ns,enum si_ds_queue_stage stage_id)157 static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
158 {
159    PERFETTO_LOG("begin event called - ts_ns=%" PRIu64, ts_ns);
160    uint32_t level = queue->stages[stage_id].level;
161    /* If we haven't managed to calibrate the alignment between GPU and CPU
162     * timestamps yet, then skip this trace, otherwise perfetto won't know
163     * what to do with it.
164     */
165    if (!queue->device->sync_gpu_ts) {
166       queue->stages[stage_id].start_ns[level] = 0;
167       return;
168    }
169 
170    if (level >= (ARRAY_SIZE(queue->stages[stage_id].start_ns) - 1))
171       return;
172 
173    queue->stages[stage_id].start_ns[level] = ts_ns;
174    queue->stages[stage_id].level++;
175 }
176 
/* Close the innermost open event for (queue, stage_id) and emit a perfetto
 * GpuRenderStageEvent covering [start_ns, ts_ns]. `app_event` is non-NULL
 * for application debug markers, which get their own dynamically interned
 * stage iid. `payload`/`payload_as_extra` optionally attach tracepoint
 * arguments as extra data on the event.
 */
static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id,
                      uint32_t submission_id, const char *app_event, const void* payload = nullptr,
                      trace_payload_as_extra_func payload_as_extra = nullptr)
{
   PERFETTO_LOG("end event called - ts_ns=%" PRIu64, ts_ns);
   struct si_ds_device *device = queue->device;

   /* If we haven't managed to calibrate the alignment between GPU and CPU
    * timestamps yet, then skip this trace, otherwise perfetto won't know
    * what to do with it.
    */
   if (!device->sync_gpu_ts)
      return;

   /* No matching begin_event() recorded — nothing to close. */
   if (queue->stages[stage_id].level == 0)
      return;

   uint32_t level = --queue->stages[stage_id].level;
   struct si_ds_stage *stage = &queue->stages[stage_id];
   uint64_t start_ns = stage->start_ns[level];
   PERFETTO_LOG("end event called - start_ns=%" PRIu64 " ts_ns=%" PRIu64, start_ns, ts_ns);
   /* Skip events with no recorded start or a negative duration. */
   if (!start_ns || start_ns > ts_ns)
      return;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-send interned descriptors if the service dropped them. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      uint64_t evt_id = device->event_id++;

      /* If this is an application event, we might need to generate a new
       * stage_iid if not already seen. Otherwise, it's a driver event and we
       * have use the internal stage_iid.
       */
      uint64_t stage_iid = app_event ?
                           tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) :
                           stage->stage_iid;

      auto packet = tctx.NewTracePacket();

      /* Timestamps here are on the GPU clock domain. */
      packet->set_timestamp(start_ns);
      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);

      assert(ts_ns >= start_ns);

      auto event = packet->set_gpu_render_stage_event();
      event->set_gpu_id(queue->device->gpu_id);

      event->set_hw_queue_iid(stage->queue_iid);
      event->set_stage_iid(stage_iid);
      event->set_context(queue->device->iid);
      event->set_event_id(evt_id);
      event->set_duration(ts_ns - start_ns);
      event->set_submission_id(submission_id);

      /* Attach tracepoint payload fields as extra data, if provided. */
      if (payload && payload_as_extra) {
         payload_as_extra(event, payload, nullptr);
      }
   });

   /* Consume the start timestamp so it can't be reused. */
   stage->start_ns[level] = 0;
}
243 
244 #endif /* HAVE_PERFETTO */
245 
246 #ifdef __cplusplus
247 extern "C" {
248 #endif
249 
250 #ifdef HAVE_PERFETTO
251 
252 /*
253  * Trace callbacks, called from u_trace once the timestamps from GPU have been
254  * collected.
255  */
256 
/* Generate the begin/end u_trace callback pair for a tracepoint. The begin
 * callback pushes a start timestamp for `stage`; the end callback pops it
 * and emits the perfetto event, forwarding the tracepoint payload. Comments
 * use block form only: a line comment inside the macro would swallow the
 * continuation backslash.
 */
#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage)                                             \
void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx,       \
                              const void *flush_data,                                             \
                              const struct trace_si_begin_##event_name *payload,                  \
                              const void *indirect_data)                                          \
{                                                                                                 \
   const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data;           \
   begin_event(flush->queue, ts_ns, stage);                                                       \
}                                                                                                 \
                                                                                                  \
void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx,         \
                            const void *flush_data,                                               \
                            const struct trace_si_end_##event_name *payload,                      \
                            const void *indirect_data)                                            \
{                                                                                                 \
   const struct si_ds_flush_data *flush =  (const struct si_ds_flush_data *) flush_data;          \
   end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload,                     \
             (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name);           \
}                                                                                                 \

CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)
279 
280 uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
281 {
282    return perfetto::base::GetBootTimeNs().count();
283 }
284 
/* Finish a queue submission started by si_ds_begin_submit(): emit a
 * VulkanApiEvent/vk_queue_submit packet with the submission duration and id.
 * If tracing is not active, resets the clock calibration so it is redone
 * when tracing resumes.
 */
void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
{
   if (!u_trace_should_process(&queue->device->trace_context)) {
      queue->device->sync_gpu_ts = 0;
      queue->device->next_clock_sync_ns = 0;
      return;
   }

   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
   uint32_t submission_id = queue->submission_id++;

   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
      /* Re-send interned descriptors if the service dropped them. */
      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
         send_descriptors(tctx, queue->device);
         state->was_cleared = false;
      }

      sync_timestamp(tctx, queue->device);

      auto packet = tctx.NewTracePacket();

      /* Boottime clock (the sequence default), unlike GPU-domain events. */
      packet->set_timestamp(start_ts);

      auto event = packet->set_vulkan_api_event();
      auto submit = event->set_vk_queue_submit();

      submit->set_duration_ns(end_ts - start_ts);
      /* The queue pointer doubles as a stable identifier. */
      submit->set_vk_queue((uintptr_t) queue);
      submit->set_submission_id(submission_id);
   });
}
316 
317 #endif /* HAVE_PERFETTO */
318 
/* One-time process-wide setup: initialize perfetto and register the
 * "gpu.renderstages.amd" data source. Run via call_once from
 * si_driver_ds_init().
 */
static void si_driver_ds_init_once(void)
{
#ifdef HAVE_PERFETTO
   util_perfetto_init();
   perfetto::DataSourceDescriptor dsd;
   dsd.set_name("gpu.renderstages.amd");
   SIRenderpassDataSource::Register(dsd);
#endif
}
328 
static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;
/* Monotonically increasing interning id counter; starts at 1 so 0 can act
 * as "unset". NOTE(review): not atomic — assumes iids are only allocated
 * during single-threaded device/queue setup; confirm against callers.
 */
static uint64_t iid = 1;

/* Hand out the next unique interning id. */
static uint64_t get_iid()
{
   return iid++;
}
336 
si_pps_clock_id(uint32_t gpu_id)337 static uint32_t si_pps_clock_id(uint32_t gpu_id)
338 {
339    char buf[40];
340    snprintf(buf, sizeof(buf), "org.freedesktop.mesa.amd.gpu%u", gpu_id);
341 
342    return _mesa_hash_string(buf) | 0x80000000;
343 }
344 
/* Public entry point: perform the once-only perfetto registration and apply
 * the tracepoint configuration environment variable.
 */
void si_driver_ds_init(void)
{
   call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
   si_gpu_tracepoint_config_variable();
}
350 
si_ds_device_init(struct si_ds_device * device,const struct radeon_info * devinfo,uint32_t gpu_id,enum amd_ds_api api)351 void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
352                        uint32_t gpu_id, enum amd_ds_api api)
353 {
354    device->gpu_id = gpu_id;
355    device->gpu_clock_id = si_pps_clock_id(gpu_id);
356    device->info = devinfo;
357    device->iid = get_iid();
358    device->api = api;
359    list_inithead(&device->queues);
360 }
361 
/* Tear down the device's u_trace context. Counterpart to si_ds_device_init()
 * (the trace context itself is presumably initialized elsewhere — it is not
 * set up in si_ds_device_init; verify against callers).
 */
void si_ds_device_fini(struct si_ds_device *device)
{
   u_trace_context_fini(&device->trace_context);
}
366 
si_ds_device_init_queue(struct si_ds_device * device,struct si_ds_queue * queue,const char * fmt_name,...)367 struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device,
368                                              struct si_ds_queue *queue,
369                                              const char *fmt_name, ...)
370 {
371    va_list ap;
372    queue->device = device;
373 
374    va_start(ap, fmt_name);
375    vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
376    va_end(ap);
377 
378    for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
379       queue->stages[s].queue_iid = get_iid();
380       queue->stages[s].stage_iid = get_iid();
381    }
382 
383    list_add(&queue->link, &device->queues);
384 
385    return queue;
386 }
387 
/* Initialize per-flush tracing data: zero the struct, record the owning
 * queue and submission id, then set up its u_trace buffer against the
 * device's trace context.
 */
void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
                           uint64_t submission_id)
{
   memset(data, 0, sizeof(*data));

   data->queue = queue;
   data->submission_id = submission_id;

   u_trace_init(&data->trace, &queue->device->trace_context);
}
398 
/* Release the u_trace buffer owned by this flush data. Counterpart to
 * si_ds_flush_data_init().
 */
void si_ds_flush_data_fini(struct si_ds_flush_data *data)
{
   u_trace_fini(&data->trace);
}
403 
404 #ifdef __cplusplus
405 }
406 #endif
407