/*
 * Copyright 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "perf/xe/intel_perf.h"

#include <fcntl.h>
#include <sys/stat.h>

#include "perf/intel_perf.h"
#include "intel_perf_common.h"
#include "intel/common/intel_gem.h"
#include "intel/common/xe/intel_device_query.h"
#include "intel/common/xe/intel_queue.h"

#include "drm-uapi/xe_drm.h"

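/* Userspace counterpart of the kernel's FIELD_PREP(): shift _val into the
 * bitfield selected by _mask (ffsll() finds the field's lowest set bit).
 */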
#define FIELD_PREP_ULL(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask))

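/* Build the 64-bit OA report format descriptor that gets passed to the KMD
 * through DRM_XE_OA_PROPERTY_OA_FORMAT.
 */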
uint64_t xe_perf_get_oa_format(struct intel_perf_config *perf)
{
   uint64_t fmt;

   if (perf->devinfo->verx10 >= 200) {
      /* BSpec: 60942
       * PEC64u64
       */
      fmt = FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_FMT_TYPE, DRM_XE_OA_FMT_TYPE_PEC);
      fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_COUNTER_SEL, 1);
      fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_COUNTER_SIZE, 1);
      fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_BC_REPORT, 0);
   } else {
      /* BSpec: 52198
       *
       * Equivalent to the I915_OA_FORMAT_A24u40_A14u32_B8_C8 (gfx 125+) and
       * I915_OA_FORMAT_A32u40_A4u32_B8_C8 (gfx 120) formats on i915.
       */
      fmt = FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_FMT_TYPE, DRM_XE_OA_FMT_TYPE_OAG);
      fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_COUNTER_SEL, 5);
      fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_COUNTER_SIZE, 0);
      fmt |= FIELD_PREP_ULL(DRM_XE_OA_FORMAT_MASK_BC_REPORT, 0);
   }

   return fmt;
}

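/* Report whether OA metrics can be collected: the Xe KMD must expose the
 * observation interface and the caller must have sufficient privileges.
 * Also probes the OA units to record optional features supported on the
 * render engine's OA unit.
 */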
bool
xe_oa_metrics_available(struct intel_perf_config *perf, int fd, bool use_register_snapshots)
{
   struct drm_xe_query_oa_units *oa_units;
   bool perf_oa_available = false;
   struct stat sb;

   /* The existence of this file implies that this Xe KMD version supports
    * the observation interface.
    */
   if (stat("/proc/sys/dev/xe/observation_paranoid", &sb) == 0) {
      uint64_t paranoid = 1;

      /* Now check whether the application has privileges to access the
       * observation interface.
       *
       * TODO: this approach does not take into account applications running
       * with CAP_PERFMON privileges.
       */
      read_file_uint64("/proc/sys/dev/xe/observation_paranoid", &paranoid);
      if (paranoid == 0 || geteuid() == 0)
         perf_oa_available = true;
   }

   if (!perf_oa_available)
      return perf_oa_available;

   perf->features_supported |= INTEL_PERF_FEATURE_HOLD_PREEMPTION;

   oa_units = xe_device_query_alloc_fetch(fd, DRM_XE_DEVICE_QUERY_OA_UNITS, NULL);
   if (oa_units) {
      uint8_t *poau;
      uint32_t i;

      poau = (uint8_t *)oa_units->oa_units;
      for (i = 0; i < oa_units->num_oa_units; i++) {
         struct drm_xe_oa_unit *oa_unit = (struct drm_xe_oa_unit *)poau;
         uint32_t engine_i;
         bool render_found = false;

         /* OA units are variable-length, so advance to the next one before
          * any early exit from this iteration.
          */
         poau += sizeof(*oa_unit) + oa_unit->num_engines * sizeof(oa_unit->eci[0]);

         for (engine_i = 0; engine_i < oa_unit->num_engines; engine_i++) {
            if (oa_unit->eci[engine_i].engine_class == DRM_XE_ENGINE_CLASS_RENDER) {
               render_found = true;
               break;
            }
         }

         if (!render_found)
            continue;

         if (oa_unit->capabilities & DRM_XE_OA_CAPS_SYNCS) {
            perf->features_supported |= INTEL_PERF_FEATURE_METRIC_SYNC;
            break;
         }
      }

      free(oa_units);
   }

   return perf_oa_available;
}

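/* Register a new OA metrics configuration with the Xe KMD. Returns the
 * positive id reported by the ioctl on success, 0 otherwise.
 */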
uint64_t
xe_add_config(struct intel_perf_config *perf, int fd,
              const struct intel_perf_registers *config,
              const char *guid)
{
   struct drm_xe_oa_config xe_config = {};
   struct drm_xe_observation_param observation_param = {
      .observation_type = DRM_XE_OBSERVATION_TYPE_OA,
      .observation_op = DRM_XE_OBSERVATION_OP_ADD_CONFIG,
      .param = (uintptr_t)&xe_config,
   };
   uint32_t *regs;
   int ret;

   memcpy(xe_config.uuid, guid, sizeof(xe_config.uuid));

   xe_config.n_regs = config->n_mux_regs + config->n_b_counter_regs + config->n_flex_regs;
   assert(xe_config.n_regs > 0);

   regs = malloc(sizeof(uint64_t) * xe_config.n_regs);
   xe_config.regs_ptr = (uintptr_t)regs;

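   /* Each register programming entry is an (offset, value) pair of 32-bit
    * words, i.e. one uint64_t per register, which is why the uint32_t
    * pointer advances by 2 entries per copied register.
    */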
   memcpy(regs, config->mux_regs, config->n_mux_regs * sizeof(uint64_t));
   regs += 2 * config->n_mux_regs;
   memcpy(regs, config->b_counter_regs, config->n_b_counter_regs * sizeof(uint64_t));
   regs += 2 * config->n_b_counter_regs;
   memcpy(regs, config->flex_regs, config->n_flex_regs * sizeof(uint64_t));

   ret = intel_ioctl(fd, DRM_IOCTL_XE_OBSERVATION, &observation_param);
   free((void *)(uintptr_t)xe_config.regs_ptr);
   return ret > 0 ? ret : 0;
}

void
xe_remove_config(struct intel_perf_config *perf, int fd, uint64_t config_id)
{
   struct drm_xe_observation_param observation_param = {
      .observation_type = DRM_XE_OBSERVATION_TYPE_OA,
      .observation_op = DRM_XE_OBSERVATION_OP_REMOVE_CONFIG,
      .param = (uintptr_t)&config_id,
   };

   intel_ioctl(fd, DRM_IOCTL_XE_OBSERVATION, &observation_param);
}

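/* Append a DRM_XE_OA_EXTENSION_SET_PROPERTY entry to the props array and
 * chain it to the previous entry through base.next_extension.
 */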
static void
oa_prop_set(struct drm_xe_ext_set_property *props, uint32_t *index,
            enum drm_xe_oa_property_id prop_id, uint64_t value)
{
   if (*index > 0)
      props[*index - 1].base.next_extension = (uintptr_t)&props[*index];

   props[*index].base.name = DRM_XE_OA_EXTENSION_SET_PROPERTY;
   props[*index].property = prop_id;
   props[*index].value = value;
   *index = *index + 1;
}

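/* Open an OA stream through DRM_IOCTL_XE_OBSERVATION and return its file
 * descriptor (negative on error). The stream fd is switched to non-blocking
 * mode so reads never stall the caller.
 */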
int
xe_perf_stream_open(struct intel_perf_config *perf_config, int drm_fd,
                    uint32_t exec_id, uint64_t metrics_set_id,
                    uint64_t report_format, uint64_t period_exponent,
                    bool hold_preemption, bool enable,
                    struct intel_bind_timeline *timeline)
{
   struct drm_xe_ext_set_property props[DRM_XE_OA_PROPERTY_NO_PREEMPT + 1] = {};
   struct drm_xe_observation_param observation_param = {
      .observation_type = DRM_XE_OBSERVATION_TYPE_OA,
      .observation_op = DRM_XE_OBSERVATION_OP_STREAM_OPEN,
      .param = (uintptr_t)&props,
   };
   struct drm_xe_sync sync = {
      .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
      .flags = DRM_XE_SYNC_FLAG_SIGNAL,
   };
   uint32_t i = 0;
   int fd, flags;

   if (exec_id)
      oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_EXEC_QUEUE_ID, exec_id);
   oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_OA_DISABLED, !enable);
   oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_SAMPLE_OA, true);
   oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_OA_METRIC_SET, metrics_set_id);
   oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_OA_FORMAT, report_format);
   oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_OA_PERIOD_EXPONENT, period_exponent);
   if (hold_preemption)
      oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_NO_PREEMPT, hold_preemption);

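   /* oa_prop_set() only stores a pointer to the sync; the handle and
    * timeline point are filled in below, before the ioctl reads them.
    */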
   if (timeline && intel_bind_timeline_get_syncobj(timeline)) {
      oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_NUM_SYNCS, 1);
      oa_prop_set(props, &i, DRM_XE_OA_PROPERTY_SYNCS, (uintptr_t)&sync);

      sync.handle = intel_bind_timeline_get_syncobj(timeline);
      sync.timeline_value = intel_bind_timeline_bind_begin(timeline);
      fd = intel_ioctl(drm_fd, DRM_IOCTL_XE_OBSERVATION, &observation_param);
      intel_bind_timeline_bind_end(timeline);
   } else {
      fd = intel_ioctl(drm_fd, DRM_IOCTL_XE_OBSERVATION, &observation_param);
   }

   if (fd < 0)
      return fd;

   flags = fcntl(fd, F_GETFL, 0);
   flags |= O_CLOEXEC | O_NONBLOCK;
   if (fcntl(fd, F_SETFL, flags)) {
      close(fd);
      return -1;
   }

   return fd;
}

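/* Enable or disable OA report generation on an already-open stream. */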
int
xe_perf_stream_set_state(int perf_stream_fd, bool enable)
{
   unsigned long uapi = enable ? DRM_XE_OBSERVATION_IOCTL_ENABLE :
                                 DRM_XE_OBSERVATION_IOCTL_DISABLE;

   return intel_ioctl(perf_stream_fd, uapi, 0);
}

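/* Reconfigure an open stream to a different metric set. When a bind
 * timeline is available, the change is ordered with syncobjs: it waits for
 * previously submitted work on the exec queue and for the previous
 * reconfiguration, then signals a new timeline point on completion.
 */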
int
xe_perf_stream_set_metrics_id(int perf_stream_fd, int drm_fd,
                              uint32_t exec_queue, uint64_t metrics_set_id,
                              struct intel_bind_timeline *timeline)
{
   struct drm_xe_ext_set_property prop[3] = {};
   uint32_t index = 0;
   int ret;

   oa_prop_set(prop, &index, DRM_XE_OA_PROPERTY_OA_METRIC_SET,
               metrics_set_id);

   if (timeline && intel_bind_timeline_get_syncobj(timeline)) {
      struct drm_xe_sync xe_syncs[3] = {};
      uint32_t syncobj;
      int ret2;

      oa_prop_set(prop, &index, DRM_XE_OA_PROPERTY_NUM_SYNCS, ARRAY_SIZE(xe_syncs));
      oa_prop_set(prop, &index, DRM_XE_OA_PROPERTY_SYNCS, (uintptr_t)xe_syncs);

      /* Wait for all previously submitted work on the exec queue. */
      ret = xe_queue_get_syncobj_for_idle(drm_fd, exec_queue, &syncobj);
      if (ret)
         return ret;
      xe_syncs[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
      xe_syncs[0].flags = 0; /* wait */
      xe_syncs[0].handle = syncobj;

      /* Wait for the previous set_metrics_id to complete. */
      xe_syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
      xe_syncs[1].flags = 0; /* wait */
      xe_syncs[1].handle = intel_bind_timeline_get_syncobj(timeline);
      xe_syncs[1].timeline_value = intel_bind_timeline_get_last_point(timeline);

      /* Signal completion. */
      xe_syncs[2].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
      xe_syncs[2].flags = DRM_XE_SYNC_FLAG_SIGNAL;
      xe_syncs[2].handle = intel_bind_timeline_get_syncobj(timeline);
      xe_syncs[2].timeline_value = intel_bind_timeline_bind_begin(timeline);

      ret = intel_ioctl(perf_stream_fd, DRM_XE_OBSERVATION_IOCTL_CONFIG,
                        (void *)(uintptr_t)&prop);
      intel_bind_timeline_bind_end(timeline);

      /* It should be safe to destroy the syncobj here: the Xe KMD is
       * expected to hold its own reference for as long as it needs it.
       */
      struct drm_syncobj_destroy syncobj_destroy = {
         .handle = syncobj,
      };
      ret2 = intel_ioctl(drm_fd, DRM_IOCTL_SYNCOBJ_DESTROY, &syncobj_destroy);
      assert(ret2 == 0);
   } else {
      ret = intel_ioctl(perf_stream_fd, DRM_XE_OBSERVATION_IOCTL_CONFIG,
                        (void *)(uintptr_t)&prop);
   }

   return ret;
}

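/* read() on the stream fd fails with EIO when the OA unit hit an error
 * condition. Query the stream status and translate it into a synthetic
 * record header that callers already know how to handle.
 */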
static int
xe_perf_stream_read_error(int perf_stream_fd, uint8_t *buffer, size_t buffer_len)
{
   struct drm_xe_oa_stream_status status = {};
   struct intel_perf_record_header *header;
   int ret;

   ret = intel_ioctl(perf_stream_fd, DRM_XE_OBSERVATION_IOCTL_STATUS, &status);
   if (ret)
      return -errno;

   header = (struct intel_perf_record_header *)buffer;
   header->pad = 0;
   header->type = 0;
   header->size = sizeof(*header);
   ret = header->size;

   if (status.oa_status & DRM_XE_OASTATUS_BUFFER_OVERFLOW)
      header->type = INTEL_PERF_RECORD_TYPE_OA_BUFFER_LOST;
   else if (status.oa_status & DRM_XE_OASTATUS_REPORT_LOST)
      header->type = INTEL_PERF_RECORD_TYPE_OA_REPORT_LOST;
   else if (status.oa_status & DRM_XE_OASTATUS_COUNTER_OVERFLOW)
      header->type = INTEL_PERF_RECORD_TYPE_COUNTER_OVERFLOW;
   else if (status.oa_status & DRM_XE_OASTATUS_MMIO_TRG_Q_FULL)
      header->type = INTEL_PERF_RECORD_TYPE_MMIO_TRG_Q_FULL;
   else
      unreachable("missing");

   return header->type ? header->size : -1;
}

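/* Read raw OA reports from the stream and repack them as
 * (intel_perf_record_header, report) pairs, the layout the common perf code
 * consumes. Returns the number of bytes written to buffer, 0 when no data
 * is available, or a negative errno on failure.
 */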
int
xe_perf_stream_read_samples(struct intel_perf_config *perf_config, int perf_stream_fd,
                            uint8_t *buffer, size_t buffer_len)
{
   const size_t sample_size = perf_config->oa_sample_size;
   const size_t sample_header_size = sample_size + sizeof(struct intel_perf_record_header);
   uint32_t num_samples = buffer_len / sample_header_size;
   const size_t max_bytes_read = num_samples * sample_size;
   uint8_t *offset, *offset_samples;
   int len, i;

   if (buffer_len < sample_header_size)
      return -ENOSPC;

   do {
      len = read(perf_stream_fd, buffer, max_bytes_read);
   } while (len < 0 && errno == EINTR);

   if (len <= 0) {
      if (errno == EIO)
         return xe_perf_stream_read_error(perf_stream_fd, buffer, buffer_len);

      return len < 0 ? -errno : 0;
   }

   num_samples = len / sample_size;
   offset = buffer;
   offset_samples = buffer + (buffer_len - len);
   /* Move all samples to the end of the buffer to make room for the record
    * headers at the front.
    */
   memmove(offset_samples, buffer, len);

   /* Set up each header, then copy its sample back from the end of the
    * buffer.
    */
   for (i = 0; i < num_samples; i++) {
      struct intel_perf_record_header *header = (struct intel_perf_record_header *)offset;

      /* TODO: also append REPORT_LOST and BUFFER_LOST */
      header->type = INTEL_PERF_RECORD_TYPE_SAMPLE;
      header->pad = 0;
      header->size = sample_header_size;
      offset += sizeof(*header);

      memmove(offset, offset_samples, sample_size);
      offset += sample_size;
      offset_samples += sample_size;
   }

   return offset - buffer;
}