/*
 * Copyright © 2020 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

24 #ifndef _U_TRACE_H
25 #define _U_TRACE_H
26 
27 #include <stdbool.h>
28 #include <stdint.h>
29 #include <stdio.h>
30 
31 #include "util/u_queue.h"
32 
33 #ifdef __cplusplus
34 extern "C" {
35 #endif
36 
/* A trace mechanism (very) loosely inspired by the linux kernel tracepoint
 * mechanism, in that it allows for defining driver specific (or common)
 * tracepoints, which generate 'trace_$name()' functions that can be
 * called at various points in commandstream emit.
 *
 * Currently a printf backend is implemented, but the expectation is to
 * also implement a perfetto backend for shipping out traces to a tool like
 * AGI.
 *
 * Notable differences:
 *
 *  - GPU timestamps!  A driver provided callback is used to emit timestamps
 *    to a buffer.  At a later point in time (when stalling to wait for the
 *    GPU is not required), the timestamps are re-united with the trace
 *    payload.  This makes the trace mechanism suitable for profiling.
 *
 *  - Instead of a systemwide trace ringbuffer, buffering of un-retired
 *    tracepoints is split into two stages.  Traces are emitted to a
 *    'u_trace' instance, and at a later time flushed to a 'u_trace_context'
 *    instance.  This avoids the requirement that commandstream containing
 *    tracepoints is emitted in the same order as it is generated.
 *
 *    If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
 *    then a `u_trace_context` per-engine should be used.
 *
 *  - Unlike kernel tracepoints, u_trace tracepoints are defined in Python,
 *    from which header and source files are generated.  Since we already
 *    have a build dependency on python+mako, this gives more flexibility
 *    than clunky preprocessor macro magic.
 *
 */

/* Forward declarations of the types used by the declarations below. */
struct u_trace_context;
struct u_trace;
struct u_trace_chunk;
struct u_trace_printer;

/**
 * Special reserved value to indicate that no timestamp was captured,
 * and that the timestamp of the previous trace should be reused.
 */
#define U_TRACE_NO_TIMESTAMP ((uint64_t)0)

/**
 * Driver provided callback to create a timestamp buffer which will be
 * read by u_trace_read_ts function.
 *
 * timestamps_count is the number of timestamps requested for the buffer.
 * The returned pointer is opaque here; it is what the other timestamp
 * callbacks receive as their 'timestamps' argument.
 */
typedef void* (*u_trace_create_ts_buffer)(struct u_trace_context *utctx,
      uint32_t timestamps_count);

/**
 * Driver provided callback to delete a timestamp buffer.
 */
typedef void (*u_trace_delete_ts_buffer)(struct u_trace_context *utctx,
      void *timestamps);

/**
 * Driver provided callback to emit commands into the specified command
 * stream to capture a 64b timestamp into the specified timestamps buffer,
 * at the specified index.
 *
 * The hw counter that the driver records should be something that runs at
 * a fixed rate, even as the GPU freq changes.  The same source used for
 * GL_TIMESTAMP queries should be appropriate.
 */
typedef void (*u_trace_record_ts)(struct u_trace *ut, void *cs,
                                  void *timestamps, unsigned idx,
                                  bool end_of_pipe);

/**
 * Driver provided callback to read back a previously recorded timestamp.
 * If necessary, this should block until the GPU has finished writing back
 * the timestamps.  (The timestamps will be read back in order, so it is
 * safe to only synchronize on idx==0.)
 *
 * flush_data is data provided by the driver via u_trace_flush.
 *
 * The returned timestamp should be in units of nanoseconds.  The same
 * timebase as GL_TIMESTAMP queries should be used.
 *
 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate
 * that no timestamp was captured and the timestamp from the previous trace
 * will be re-used.  (The first trace in the u_trace buf may not do this.)
 * This allows the driver to detect cases where multiple tracepoints are
 * emitted with no other intervening cmdstream, to avoid pointlessly
 * capturing the same timestamp multiple times in a row.
 */
typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx,
      void *timestamps, unsigned idx, void *flush_data);

/**
 * Driver provided callback to delete flush data.
 */
typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx,
      void *flush_data);

133 /**
134  * The trace context provides tracking for "in-flight" traces, once the
135  * cmdstream that records timestamps has been flushed.
136  */
137 struct u_trace_context {
138    void *pctx;
139 
140    u_trace_create_ts_buffer  create_timestamp_buffer;
141    u_trace_delete_ts_buffer  delete_timestamp_buffer;
142    u_trace_record_ts         record_timestamp;
143    u_trace_read_ts           read_timestamp;
144    u_trace_delete_flush_data delete_flush_data;
145 
146    FILE *out;
147    struct u_trace_printer *out_printer;
148 
149    /* Once u_trace_flush() is called u_trace_chunk's are queued up to
150     * render tracepoints on a queue.  The per-chunk queue jobs block until
151     * timestamps are available.
152     */
153    struct util_queue queue;
154 
155 #ifdef HAVE_PERFETTO
156    /* node in global list of trace contexts. */
157    struct list_head node;
158 #endif
159 
160    /* State to accumulate time across N chunks associated with a single
161     * batch (u_trace).
162     */
163    uint64_t last_time_ns;
164    uint64_t first_time_ns;
165 
166    uint32_t frame_nr;
167    uint32_t batch_nr;
168    uint32_t event_nr;
169    bool start_of_frame;
170 
171    /* list of unprocessed trace chunks in fifo order: */
172    struct list_head flushed_trace_chunks;
173 };
174 
175 /**
176  * The u_trace ptr is passed as the first arg to generated tracepoints.
177  * It provides buffering for tracepoint payload until the corresponding
178  * driver cmdstream containing the emitted commands to capture is
179  * flushed.
180  *
181  * Individual tracepoints emitted to u_trace are expected to be "executed"
182  * (ie. timestamp captured) in FIFO order with respect to other tracepoints
183  * emitted to the same u_trace.  But the order WRT other u_trace instances
184  * is undefined util u_trace_flush().
185  */
186 struct u_trace {
187    struct u_trace_context *utctx;
188 
189    struct list_head trace_chunks;  /* list of unflushed trace chunks in fifo order */
190 
191    bool enabled;
192 };
193 
194 void u_trace_context_init(struct u_trace_context *utctx,
195       void *pctx,
196       u_trace_create_ts_buffer   create_timestamp_buffer,
197       u_trace_delete_ts_buffer   delete_timestamp_buffer,
198       u_trace_record_ts          record_timestamp,
199       u_trace_read_ts            read_timestamp,
200       u_trace_delete_flush_data  delete_flush_data);
201 void u_trace_context_fini(struct u_trace_context *utctx);
202 
/**
 * Flush (trigger processing) of traces previously flushed to the trace-context
 * by u_trace_flush().
 *
 * This should typically be called in the driver's pctx->flush().
 *
 * NOTE(review): 'eof' presumably marks the end of a frame -- confirm
 * against the implementation.
 */
void u_trace_context_process(struct u_trace_context *utctx, bool eof);

/* Set up / tear down a u_trace; init associates it with the trace-context
 * 'utctx'.
 */
void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
void u_trace_fini(struct u_trace *ut);

/* NOTE(review): presumably reports whether any tracepoints have been
 * emitted to 'ut' -- confirm against the implementation.
 */
bool u_trace_has_points(struct u_trace *ut);

/**
 * Position of a tracepoint within a u_trace: the owning u_trace, a chunk,
 * and an event index.  Passed by value to the iterator functions.
 */
struct u_trace_iterator
{
   struct u_trace *ut;
   struct u_trace_chunk *chunk;
   uint32_t event_idx;
};

/* Iterators delimiting the tracepoints of 'ut'; they are the begin_it /
 * end_it arguments taken by u_trace_clone_append() and
 * u_trace_disable_event_range().
 */
struct u_trace_iterator
u_trace_begin_iterator(struct u_trace *ut);

struct u_trace_iterator
u_trace_end_iterator(struct u_trace *ut);

/* Compare two iterators for equality. */
bool
u_trace_iterator_equal(struct u_trace_iterator a,
                       struct u_trace_iterator b);

/**
 * Driver provided callback, passed to u_trace_clone_append(), to copy
 * 'count' timestamps on the GPU from the 'ts_from' buffer to the 'ts_to'
 * buffer.
 *
 * NOTE(review): from_offset / to_offset are presumably timestamp indices
 * rather than byte offsets -- confirm against the drivers.
 */
typedef void (*u_trace_copy_ts_buffer)(struct u_trace_context *utctx,
      void *cmdstream,
      void *ts_from, uint32_t from_offset,
      void *ts_to, uint32_t to_offset,
      uint32_t count);

239 /**
240  * Clones tracepoints range into target u_trace.
241  * Provides callback for driver to copy timestamps on GPU from
242  * one buffer to another.
243  *
244  * It allows:
245  * - Tracing re-usable command buffer in Vulkan, by copying tracepoints
246  *   each time it is submitted.
247  * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
248  *   corresponding to a tile.
249  */
250 void u_trace_clone_append(struct u_trace_iterator begin_it,
251                           struct u_trace_iterator end_it,
252                           struct u_trace *into,
253                           void *cmdstream,
254                           u_trace_copy_ts_buffer copy_ts_buffer);
255 
/* NOTE(review): presumably disables the tracepoints between begin_it and
 * end_it -- whether end_it is inclusive is not visible here; confirm
 * against the implementation.
 */
void u_trace_disable_event_range(struct u_trace_iterator begin_it,
                                 struct u_trace_iterator end_it);

/**
 * Flush traces to the parent trace-context.  At this point, the expectation
 * is that all the tracepoints are "executed" by the GPU following any previously
 * flushed u_trace batch.
 *
 * flush_data is a way for driver to pass additional data, which becomes available
 * only at the point of flush, to the u_trace_read_ts callback and perfetto.
 * The typical example of such data would be a fence to wait on in u_trace_read_ts,
 * and a submission_id to pass into perfetto.
 * The destruction of the data is done via u_trace_delete_flush_data.
 *
 * NOTE(review): free_data presumably selects whether flush_data is destroyed
 * once the flushed traces have been processed -- confirm against the
 * implementation.
 *
 * This should typically be called when the corresponding cmdstream (containing
 * the timestamp reads) is flushed to the kernel.
 */
void u_trace_flush(struct u_trace *ut, void *flush_data, bool free_data);

/**
 * Whether command buffers should be instrumented even if not collecting
 * traces.
 */
extern bool ut_trace_instrument;

/* Perfetto backend hooks.  Without HAVE_PERFETTO, ut_perfetto_enabled is a
 * constant 0 so that code testing it still compiles.
 */
#ifdef HAVE_PERFETTO
extern int ut_perfetto_enabled;

void u_trace_perfetto_start(void);
void u_trace_perfetto_stop(void);
#else
#  define ut_perfetto_enabled 0
#endif

290 static inline bool
u_trace_context_actively_tracing(struct u_trace_context * utctx)291 u_trace_context_actively_tracing(struct u_trace_context *utctx)
292 {
293    return !!utctx->out || (ut_perfetto_enabled > 0);
294 }
295 
296 static inline bool
u_trace_context_instrumenting(struct u_trace_context * utctx)297 u_trace_context_instrumenting(struct u_trace_context *utctx)
298 {
299    return !!utctx->out || ut_trace_instrument || (ut_perfetto_enabled > 0);
300 }
301 
302 #ifdef __cplusplus
303 }
304 #endif
305 
306 #endif  /* _U_TRACE_H */
307