1 /*
2  * Copyright © 2019 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <unistd.h>
25 #include <poll.h>
26 
27 #include "common/intel_gem.h"
28 
29 #include "dev/intel_debug.h"
30 #include "dev/intel_device_info.h"
31 
32 #include "perf/intel_perf.h"
33 #include "perf/intel_perf_mdapi.h"
34 #include "perf/intel_perf_private.h"
35 #include "perf/intel_perf_query.h"
36 #include "perf/intel_perf_regs.h"
37 
38 #include "drm-uapi/i915_drm.h"
39 
40 #include "util/compiler.h"
41 #include "util/u_math.h"
42 
43 #define FILE_DEBUG_FLAG DEBUG_PERFMON
44 
45 #define MI_RPC_BO_SIZE                (4096)
46 #define MI_FREQ_OFFSET_BYTES          (256)
47 #define MI_PERF_COUNTERS_OFFSET_BYTES (260)
48 
49 #define ALIGN(x, y) (((x) + (y)-1) & ~((y)-1))
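/* For example, ALIGN(13, 8) rounds up to 16 and ALIGN(4096, 4096) stays 4096. */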
50 
51 #define MAP_READ  (1 << 0)
52 #define MAP_WRITE (1 << 1)
53 
54 /**
55  * Periodic OA samples are read() into these buffer structures via the
56  * i915 perf kernel interface and appended to the
57  * perf_ctx->sample_buffers linked list. When we process the
58  * results of an OA metrics query we need to consider all the periodic
59  * samples between the Begin and End MI_REPORT_PERF_COUNT command
60  * markers.
61  *
 62  * 'Periodic' is a simplification, as other automatic reports written
 63  * by the hardware are also buffered here.
64  *
65  * Considering three queries, A, B and C:
66  *
67  *  Time ---->
68  *                ________________A_________________
69  *                |                                |
70  *                | ________B_________ _____C___________
71  *                | |                | |           |   |
72  *
73  * And an illustration of sample buffers read over this time frame:
74  * [HEAD ][     ][     ][     ][     ][     ][     ][     ][TAIL ]
75  *
76  * These nodes may hold samples for query A:
77  * [     ][     ][  A  ][  A  ][  A  ][  A  ][  A  ][     ][     ]
78  *
79  * These nodes may hold samples for query B:
80  * [     ][     ][  B  ][  B  ][  B  ][     ][     ][     ][     ]
81  *
82  * These nodes may hold samples for query C:
83  * [     ][     ][     ][     ][     ][  C  ][  C  ][  C  ][     ]
84  *
85  * The illustration assumes we have an even distribution of periodic
86  * samples so all nodes have the same size plotted against time:
87  *
88  * Note, to simplify code, the list is never empty.
89  *
90  * With overlapping queries we can see that periodic OA reports may
 91  * relate to multiple queries and care needs to be taken to keep
92  * track of sample buffers until there are no queries that might
93  * depend on their contents.
94  *
95  * We use a node ref counting system where a reference ensures that a
96  * node and all following nodes can't be freed/recycled until the
97  * reference drops to zero.
98  *
99  * E.g. with a ref of one here:
100  * [  0  ][  0  ][  1  ][  0  ][  0  ][  0  ][  0  ][  0  ][  0  ]
101  *
102  * These nodes could be freed or recycled ("reaped"):
103  * [  0  ][  0  ]
104  *
105  * These must be preserved until the leading ref drops to zero:
106  *               [  1  ][  0  ][  0  ][  0  ][  0  ][  0  ][  0  ]
107  *
108  * When a query starts we take a reference on the current tail of
109  * the list, knowing that no already-buffered samples can possibly
110  * relate to the newly-started query. A pointer to this node is
111  * also saved in the query object's ->oa.samples_head.
112  *
113  * E.g. starting query A while there are two nodes in .sample_buffers:
114  *                ________________A________
115  *                |
116  *
117  * [  0  ][  1  ]
118  *           ^_______ Add a reference and store pointer to node in
119  *                    A->oa.samples_head
120  *
121  * Moving forward to when the B query starts with no new buffer nodes:
122  * (for reference, i915 perf reads() are only done when queries finish)
123  *                ________________A_______
124  *                | ________B___
125  *                | |
126  *
127  * [  0  ][  2  ]
128  *           ^_______ Add a reference and store pointer to
129  *                    node in B->oa.samples_head
130  *
 131  * Once a query is finished (i.e. the OA query has become 'Ready',
 132  * the End OA report has landed and we have processed all the
 133  * intermediate periodic samples), we drop the
 134  * ->oa.samples_head reference we took at the start.
135  *
136  * So when the B query has finished we have:
137  *                ________________A________
138  *                | ______B___________
139  *                | |                |
140  * [  0  ][  1  ][  0  ][  0  ][  0  ]
141  *           ^_______ Drop B->oa.samples_head reference
142  *
143  * We still can't free these due to the A->oa.samples_head ref:
144  *        [  1  ][  0  ][  0  ][  0  ]
145  *
146  * When the A query finishes: (note there's a new ref for C's samples_head)
147  *                ________________A_________________
148  *                |                                |
149  *                |                    _____C_________
150  *                |                    |           |
151  * [  0  ][  0  ][  0  ][  0  ][  1  ][  0  ][  0  ]
152  *           ^_______ Drop A->oa.samples_head reference
153  *
154  * And we can now reap these nodes up to the C->oa.samples_head:
155  * [  X  ][  X  ][  X  ][  X  ]
156  *                  keeping -> [  1  ][  0  ][  0  ]
157  *
158  * We reap old sample buffers each time we finish processing an OA
159  * query by iterating the sample_buffers list from the head until we
160  * find a referenced node and stop.
161  *
162  * Reaped buffers move to a perfquery.free_sample_buffers list and
163  * when we come to read() we first look to recycle a buffer from the
164  * free_sample_buffers list before allocating a new buffer.
165  */
166 struct oa_sample_buf {
167    struct exec_node link;
168    int refcount;
169    int len;
170    uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
171    uint32_t last_timestamp;
172 };
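/* A minimal sketch (hypothetical helpers, not part of this file's API) of
 * the reference pattern described above: a query pins the current tail
 * buffer when it begins and releases it once its reports have been
 * accumulated, after which unreferenced buffers at the head of the list
 * may be reaped (see reap_old_sample_buffers() below).
 */
static inline struct exec_node *
example_take_samples_ref(struct exec_list *sample_buffers)
{
   /* The tail node already existed before this query's Begin MI_RPC, so it
    * only marks where this query's samples can start appearing.
    */
   struct exec_node *tail = exec_list_get_tail(sample_buffers);
   struct oa_sample_buf *buf = exec_node_data(struct oa_sample_buf, tail, link);

   buf->refcount++; /* pins this node and every node appended after it */
   return tail;     /* the real code stores this in query->oa.samples_head */
}

static inline void
example_drop_samples_ref(struct exec_node *samples_head)
{
   struct oa_sample_buf *buf =
      exec_node_data(struct oa_sample_buf, samples_head, link);

   buf->refcount--; /* head-side buffers that reach zero can now be reaped */
}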
173 
174 /**
175  * gen representation of a performance query object.
176  *
177  * NB: We want to keep this structure relatively lean considering that
178  * applications may expect to allocate enough objects to be able to
179  * query around all draw calls in a frame.
180  */
181 struct intel_perf_query_object
182 {
183    const struct intel_perf_query_info *queryinfo;
184 
185    /* See query->kind to know which state below is in use... */
186    union {
187       struct {
188 
189          /**
190           * BO containing OA counter snapshots at query Begin/End time.
191           */
192          void *bo;
193 
194          /**
 195           * Address of the mapping of @bo
196           */
197          void *map;
198 
199          /**
200           * The MI_REPORT_PERF_COUNT command lets us specify a unique
201           * ID that will be reflected in the resulting OA report
202           * that's written by the GPU. This is the ID we're expecting
 203           * in the begin report, and the end report should be
204           * @begin_report_id + 1.
205           */
206          int begin_report_id;
207 
208          /**
 209           * Reference the head of the perf_ctx->sample_buffers
210           * list at the time that the query started (so we only need
211           * to look at nodes after this point when looking for samples
212           * related to this query)
213           *
 214           * (See struct oa_sample_buf description for more details)
215           */
216          struct exec_node *samples_head;
217 
218          /**
219           * false while in the unaccumulated_elements list, and set to
220           * true when the final, end MI_RPC snapshot has been
221           * accumulated.
222           */
223          bool results_accumulated;
224 
225          /**
226           * Accumulated OA results between begin and end of the query.
227           */
228          struct intel_perf_query_result result;
229       } oa;
230 
231       struct {
232          /**
233           * BO containing starting and ending snapshots for the
234           * statistics counters.
235           */
236          void *bo;
237       } pipeline_stats;
238    };
239 };
240 
241 struct intel_perf_context {
242    struct intel_perf_config *perf;
243 
244    void * mem_ctx; /* ralloc context */
245    void * ctx;  /* driver context (eg, brw_context) */
246    void * bufmgr;
247    const struct intel_device_info *devinfo;
248 
249    uint32_t hw_ctx;
250    int drm_fd;
251 
252    /* The i915 perf stream we open to setup + enable the OA counters */
253    int oa_stream_fd;
254 
255    /* An i915 perf stream fd gives exclusive access to the OA unit that will
256     * report counter snapshots for a specific counter set/profile in a
257     * specific layout/format so we can only start OA queries that are
258     * compatible with the currently open fd...
259     */
260    int current_oa_metrics_set_id;
261    int current_oa_format;
262 
263    /* List of buffers containing OA reports */
264    struct exec_list sample_buffers;
265 
266    /* Cached list of empty sample buffers */
267    struct exec_list free_sample_buffers;
268 
269    int n_active_oa_queries;
270    int n_active_pipeline_stats_queries;
271 
272    /* The number of queries depending on running OA counters which
 273     * extends beyond intel_perf_end_query() since we need to wait until
 274     * the last MI_RPC command has been parsed by the GPU.
275     *
276     * Accurate accounting is important here as emitting an
277     * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
278     * effectively hang the gpu.
279     */
280    int n_oa_users;
281 
 282    /* To help catch a spurious problem with the hardware or perf
283     * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
284     * with a unique ID that we can explicitly check for...
285     */
286    int next_query_start_report_id;
287 
288    /**
289     * An array of queries whose results haven't yet been assembled
290     * based on the data in buffer objects.
291     *
292     * These may be active, or have already ended.  However, the
293     * results have not been requested.
294     */
295    struct intel_perf_query_object **unaccumulated;
296    int unaccumulated_elements;
297    int unaccumulated_array_size;
298 
299    /* The total number of query objects so we can relinquish
300     * our exclusive access to perf if the application deletes
301     * all of its objects. (NB: We only disable perf while
302     * there are no active queries)
303     */
304    int n_query_instances;
305 
306    int period_exponent;
307 };
308 
309 static bool
310 inc_n_users(struct intel_perf_context *perf_ctx)
311 {
312    if (perf_ctx->n_oa_users == 0 &&
313        intel_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_ENABLE, 0) < 0)
314    {
315       return false;
316    }
317    ++perf_ctx->n_oa_users;
318 
319    return true;
320 }
321 
322 static void
323 dec_n_users(struct intel_perf_context *perf_ctx)
324 {
325    /* Disabling the i915 perf stream will effectively disable the OA
326     * counters.  Note it's important to be sure there are no outstanding
327     * MI_RPC commands at this point since they could stall the CS
328     * indefinitely once OACONTROL is disabled.
329     */
330    --perf_ctx->n_oa_users;
331    if (perf_ctx->n_oa_users == 0 &&
332        intel_ioctl(perf_ctx->oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
333    {
334       DBG("WARNING: Error disabling gen perf stream: %m\n");
335    }
336 }
337 
338 void
339 intel_perf_close(struct intel_perf_context *perfquery,
340                  const struct intel_perf_query_info *query)
341 {
342    if (perfquery->oa_stream_fd != -1) {
343       close(perfquery->oa_stream_fd);
344       perfquery->oa_stream_fd = -1;
345    }
346    if (query && query->kind == INTEL_PERF_QUERY_TYPE_RAW) {
347       struct intel_perf_query_info *raw_query =
348          (struct intel_perf_query_info *) query;
349       raw_query->oa_metrics_set_id = 0;
350    }
351 }
352 
353 bool
354 intel_perf_open(struct intel_perf_context *perf_ctx,
355                 int metrics_set_id,
356                 int report_format,
357                 int period_exponent,
358                 int drm_fd,
359                 uint32_t ctx_id,
360                 bool enable)
361 {
362    uint64_t properties[DRM_I915_PERF_PROP_MAX * 2];
363    uint32_t p = 0;
364 
365    /* Single context sampling if valid context id. */
366    if (ctx_id != INTEL_PERF_INVALID_CTX_ID) {
367       properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE;
368       properties[p++] = ctx_id;
369    }
370 
371    /* Include OA reports in samples */
372    properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA;
373    properties[p++] = true;
374 
375    /* OA unit configuration */
376    properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET;
377    properties[p++] = metrics_set_id;
378 
379    properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT;
380    properties[p++] = report_format;
381 
382    properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT;
383    properties[p++] = period_exponent;
384 
385    /* If global SSEU is available, pin it to the default. This will ensure on
386     * Gfx11 for instance we use the full EU array. Initially when perf was
387     * enabled we would use only half on Gfx11 because of functional
388     * requirements.
389     *
 390     * Temporarily disable this option on Gfx12.5+, as the kernel doesn't
 391     * appear to support it.
392     */
393    if (intel_perf_has_global_sseu(perf_ctx->perf) &&
394        perf_ctx->devinfo->verx10 < 125) {
395       properties[p++] = DRM_I915_PERF_PROP_GLOBAL_SSEU;
396       properties[p++] = to_user_pointer(&perf_ctx->perf->sseu);
397    }
398 
399    assert(p <= ARRAY_SIZE(properties));
400 
401    struct drm_i915_perf_open_param param = {
402       .flags = I915_PERF_FLAG_FD_CLOEXEC |
403                I915_PERF_FLAG_FD_NONBLOCK |
404                (enable ? 0 : I915_PERF_FLAG_DISABLED),
405       .num_properties = p / 2,
406       .properties_ptr = (uintptr_t) properties,
407    };
408    int fd = intel_ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
409    if (fd == -1) {
410       DBG("Error opening gen perf OA stream: %m\n");
411       return false;
412    }
413 
414    perf_ctx->oa_stream_fd = fd;
415 
416    perf_ctx->current_oa_metrics_set_id = metrics_set_id;
417    perf_ctx->current_oa_format = report_format;
418 
419    if (enable)
420       ++perf_ctx->n_oa_users;
421 
422    return true;
423 }
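/* An illustrative call (hypothetical values; the real call site is in
 * intel_perf_begin_query() below): open a disabled stream for a metric set
 * using the A32u40_A4u32_B8_C8 report format and the context's sampling
 * exponent, then enable it later through inc_n_users():
 *
 *    intel_perf_open(perf_ctx, metric_id, I915_OA_FORMAT_A32u40_A4u32_B8_C8,
 *                    perf_ctx->period_exponent, perf_ctx->drm_fd,
 *                    perf_ctx->hw_ctx, false);
 */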
424 
425 static uint64_t
426 get_metric_id(struct intel_perf_config *perf,
427               const struct intel_perf_query_info *query)
428 {
429    /* These queries are known never to change; their config ID has been
430     * loaded upon the first query creation. No need to look them up again.
431     */
432    if (query->kind == INTEL_PERF_QUERY_TYPE_OA)
433       return query->oa_metrics_set_id;
434 
435    assert(query->kind == INTEL_PERF_QUERY_TYPE_RAW);
436 
437    /* Raw queries can be reprogrammed by an external application/library.
438     * When a raw query is used for the first time its ID is set to a value !=
439     * 0. When it stops being used the ID returns to 0. No need to reload the
440     * ID when it's already loaded.
441     */
442    if (query->oa_metrics_set_id != 0) {
443       DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n",
444           query->name, query->guid, query->oa_metrics_set_id);
445       return query->oa_metrics_set_id;
446    }
447 
448    struct intel_perf_query_info *raw_query = (struct intel_perf_query_info *)query;
449    if (!intel_perf_load_metric_id(perf, query->guid,
450                                 &raw_query->oa_metrics_set_id)) {
451       DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
452       raw_query->oa_metrics_set_id = perf->fallback_raw_oa_metric;
453    } else {
454       DBG("Raw query '%s'guid=%s loaded ID: %"PRIu64"\n",
455           query->name, query->guid, query->oa_metrics_set_id);
456    }
457    return query->oa_metrics_set_id;
458 }
459 
460 static struct oa_sample_buf *
461 get_free_sample_buf(struct intel_perf_context *perf_ctx)
462 {
463    struct exec_node *node = exec_list_pop_head(&perf_ctx->free_sample_buffers);
464    struct oa_sample_buf *buf;
465 
466    if (node)
467       buf = exec_node_data(struct oa_sample_buf, node, link);
468    else {
469       buf = ralloc_size(perf_ctx->perf, sizeof(*buf));
470 
471       exec_node_init(&buf->link);
472       buf->refcount = 0;
473    }
474    buf->len = 0;
475 
476    return buf;
477 }
478 
479 static void
480 reap_old_sample_buffers(struct intel_perf_context *perf_ctx)
481 {
482    struct exec_node *tail_node =
483       exec_list_get_tail(&perf_ctx->sample_buffers);
484    struct oa_sample_buf *tail_buf =
485       exec_node_data(struct oa_sample_buf, tail_node, link);
486 
487    /* Remove all old, unreferenced sample buffers walking forward from
488     * the head of the list, except always leave at least one node in
489     * the list so we always have a node to reference when we Begin
490     * a new query.
491     */
492    foreach_list_typed_safe(struct oa_sample_buf, buf, link,
493                            &perf_ctx->sample_buffers)
494    {
495       if (buf->refcount == 0 && buf != tail_buf) {
496          exec_node_remove(&buf->link);
497          exec_list_push_head(&perf_ctx->free_sample_buffers, &buf->link);
498       } else
499          return;
500    }
501 }
502 
503 static void
504 free_sample_bufs(struct intel_perf_context *perf_ctx)
505 {
506    foreach_list_typed_safe(struct oa_sample_buf, buf, link,
507                            &perf_ctx->free_sample_buffers)
508       ralloc_free(buf);
509 
510    exec_list_make_empty(&perf_ctx->free_sample_buffers);
511 }
512 
513 
514 struct intel_perf_query_object *
515 intel_perf_new_query(struct intel_perf_context *perf_ctx, unsigned query_index)
516 {
517    const struct intel_perf_query_info *query =
518       &perf_ctx->perf->queries[query_index];
519 
520    switch (query->kind) {
521    case INTEL_PERF_QUERY_TYPE_OA:
522    case INTEL_PERF_QUERY_TYPE_RAW:
523       if (perf_ctx->period_exponent == 0)
524          return NULL;
525       break;
526    case INTEL_PERF_QUERY_TYPE_PIPELINE:
527       break;
528    }
529 
530    struct intel_perf_query_object *obj =
531       calloc(1, sizeof(struct intel_perf_query_object));
532 
533    if (!obj)
534       return NULL;
535 
536    obj->queryinfo = query;
537 
538    perf_ctx->n_query_instances++;
539    return obj;
540 }
541 
542 int
543 intel_perf_active_queries(struct intel_perf_context *perf_ctx,
544                           const struct intel_perf_query_info *query)
545 {
546    assert(perf_ctx->n_active_oa_queries == 0 || perf_ctx->n_active_pipeline_stats_queries == 0);
547 
548    switch (query->kind) {
549    case INTEL_PERF_QUERY_TYPE_OA:
550    case INTEL_PERF_QUERY_TYPE_RAW:
551       return perf_ctx->n_active_oa_queries;
552       break;
553 
554    case INTEL_PERF_QUERY_TYPE_PIPELINE:
555       return perf_ctx->n_active_pipeline_stats_queries;
556       break;
557 
558    default:
559       unreachable("Unknown query type");
560       break;
561    }
562 }
563 
564 const struct intel_perf_query_info*
565 intel_perf_query_info(const struct intel_perf_query_object *query)
566 {
567    return query->queryinfo;
568 }
569 
570 struct intel_perf_context *
571 intel_perf_new_context(void *parent)
572 {
573    struct intel_perf_context *ctx = rzalloc(parent, struct intel_perf_context);
574    if (! ctx)
575       fprintf(stderr, "%s: failed to alloc context\n", __func__);
576    return ctx;
577 }
578 
579 struct intel_perf_config *
580 intel_perf_config(struct intel_perf_context *ctx)
581 {
582    return ctx->perf;
583 }
584 
585 void
586 intel_perf_init_context(struct intel_perf_context *perf_ctx,
587                         struct intel_perf_config *perf_cfg,
588                         void * mem_ctx, /* ralloc context */
589                         void * ctx,  /* driver context (eg, brw_context) */
590                         void * bufmgr,  /* eg brw_bufmgr */
591                         const struct intel_device_info *devinfo,
592                         uint32_t hw_ctx,
593                         int drm_fd)
594 {
595    perf_ctx->perf = perf_cfg;
596    perf_ctx->mem_ctx = mem_ctx;
597    perf_ctx->ctx = ctx;
598    perf_ctx->bufmgr = bufmgr;
599    perf_ctx->drm_fd = drm_fd;
600    perf_ctx->hw_ctx = hw_ctx;
601    perf_ctx->devinfo = devinfo;
602 
603    perf_ctx->unaccumulated =
604       ralloc_array(mem_ctx, struct intel_perf_query_object *, 2);
605    perf_ctx->unaccumulated_elements = 0;
606    perf_ctx->unaccumulated_array_size = 2;
607 
608    exec_list_make_empty(&perf_ctx->sample_buffers);
609    exec_list_make_empty(&perf_ctx->free_sample_buffers);
610 
611    /* It's convenient to guarantee that this linked list of sample
612     * buffers is never empty, so we add an empty head node; that way,
613     * when we Begin an OA query, we can always take a reference on a
614     * buffer in this list.
615     */
616    struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
617    exec_list_push_head(&perf_ctx->sample_buffers, &buf->link);
618 
619    perf_ctx->oa_stream_fd = -1;
620    perf_ctx->next_query_start_report_id = 1000;
621 
622    /* The period_exponent gives a sampling period as follows:
623     *   sample_period = timestamp_period * 2^(period_exponent + 1)
624     *
625     * The timestamp increments every 80ns (HSW), ~52ns (GFX9LP) or
626     * ~83ns (GFX8/9).
627     *
628     * The counter overflow period is derived from the EuActive counter
629     * which reads a counter that increments by the number of clock
630     * cycles multiplied by the number of EUs. It can be calculated as:
631     *
632     * 2^(number of bits in A counter) / (n_eus * max_intel_freq * 2)
633     *
634     * (E.g. 40 EUs @ 1GHz = ~53ms)
635     *
636     * We select a sampling period shorter than that overflow period to
637     * ensure we cannot see more than 1 counter overflow; otherwise we
638     * could lose information.
639     */
640 
641    int a_counter_in_bits = 32;
642    if (devinfo->ver >= 8)
643       a_counter_in_bits = 40;
644 
645    uint64_t overflow_period = pow(2, a_counter_in_bits) / (perf_cfg->sys_vars.n_eus *
646        /* drop 1GHz freq to have units in nanoseconds */
647        2);
648 
649    DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
650        overflow_period, overflow_period / 1000000ul, perf_cfg->sys_vars.n_eus);
651 
652    int period_exponent = 0;
653    uint64_t prev_sample_period, next_sample_period;
654    for (int e = 0; e < 30; e++) {
655       prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
656       next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;
657 
658        /* Take the previous sampling period, the last one that is lower
659         * than the overflow period.
660         */
661       if (prev_sample_period < overflow_period &&
662           next_sample_period > overflow_period)
663          period_exponent = e + 1;
664    }
665 
666    perf_ctx->period_exponent = period_exponent;
667 
668    if (period_exponent == 0) {
669       DBG("WARNING: enable to find a sampling exponent\n");
670    } else {
671       DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
672             prev_sample_period / 1000000ul);
673    }
674 }
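/* A rough worked example of the exponent selection above (illustrative
 * numbers only: a 40-bit A counter, n_eus = 40 and a ~12 MHz / ~83 ns
 * timestamp):
 *
 *    overflow_period             = 2^40 / (40 * 2)     ~= 13.7e9 ns ~= 13.7 s
 *    prev_sample_period (e = 26) = 1e9 * 2^27 / 12e6   ~= 11.2 s
 *    next_sample_period (e = 26) = 1e9 * 2^28 / 12e6   ~= 22.4 s
 *
 * prev < overflow < next, so the loop settles on period_exponent = 27,
 * i.e. a ~11.2 s sampling period, safely below the counter overflow period.
 */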
675 
676 /**
677  * Add a query to the global list of "unaccumulated queries."
678  *
679  * Queries are tracked here until all the associated OA reports have
680  * been accumulated via accumulate_oa_reports() after the end
681  * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
682  */
683 static void
684 add_to_unaccumulated_query_list(struct intel_perf_context *perf_ctx,
685                                 struct intel_perf_query_object *obj)
686 {
687    if (perf_ctx->unaccumulated_elements >=
688        perf_ctx->unaccumulated_array_size)
689    {
690       perf_ctx->unaccumulated_array_size *= 1.5;
691       perf_ctx->unaccumulated =
692          reralloc(perf_ctx->mem_ctx, perf_ctx->unaccumulated,
693                   struct intel_perf_query_object *,
694                   perf_ctx->unaccumulated_array_size);
695    }
696 
697    perf_ctx->unaccumulated[perf_ctx->unaccumulated_elements++] = obj;
698 }
699 
700 /**
701  * Emit MI_STORE_REGISTER_MEM commands to capture all of the
702  * pipeline statistics for the performance query object.
703  */
704 static void
705 snapshot_statistics_registers(struct intel_perf_context *ctx,
706                               struct intel_perf_query_object *obj,
707                               uint32_t offset_in_bytes)
708 {
709    struct intel_perf_config *perf = ctx->perf;
710    const struct intel_perf_query_info *query = obj->queryinfo;
711    const int n_counters = query->n_counters;
712 
713    for (int i = 0; i < n_counters; i++) {
714       const struct intel_perf_query_counter *counter = &query->counters[i];
715 
716       assert(counter->data_type == INTEL_PERF_COUNTER_DATA_TYPE_UINT64);
717 
718       perf->vtbl.store_register_mem(ctx->ctx, obj->pipeline_stats.bo,
719                                     counter->pipeline_stat.reg, 8,
720                                     offset_in_bytes + counter->offset);
721    }
722 }
723 
724 static void
725 snapshot_query_layout(struct intel_perf_context *perf_ctx,
726                       struct intel_perf_query_object *query,
727                       bool end_snapshot)
728 {
729    struct intel_perf_config *perf_cfg = perf_ctx->perf;
730    const struct intel_perf_query_field_layout *layout = &perf_cfg->query_layout;
731    uint32_t offset = end_snapshot ? align(layout->size, layout->alignment) : 0;
732 
733    for (uint32_t f = 0; f < layout->n_fields; f++) {
734       const struct intel_perf_query_field *field =
735          &layout->fields[end_snapshot ? f : (layout->n_fields - 1 - f)];
736 
737       switch (field->type) {
738       case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
739          perf_cfg->vtbl.emit_mi_report_perf_count(perf_ctx->ctx, query->oa.bo,
740                                                   offset + field->location,
741                                                   query->oa.begin_report_id +
742                                                   (end_snapshot ? 1 : 0));
743          break;
744       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
745       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
746       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
747       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
748       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
749          perf_cfg->vtbl.store_register_mem(perf_ctx->ctx, query->oa.bo,
750                                            field->mmio_offset, field->size,
751                                            offset + field->location);
752          break;
753       default:
754          unreachable("Invalid field type");
755       }
756    }
757 }
758 
759 bool
760 intel_perf_begin_query(struct intel_perf_context *perf_ctx,
761                        struct intel_perf_query_object *query)
762 {
763    struct intel_perf_config *perf_cfg = perf_ctx->perf;
764    const struct intel_perf_query_info *queryinfo = query->queryinfo;
765 
766    /* XXX: We have to consider that the command parser unit that parses batch
767     * buffer commands and is used to capture begin/end counter snapshots isn't
768     * implicitly synchronized with what's currently running across other GPU
769     * units (such as the EUs running shaders) that the performance counters are
770     * associated with.
771     *
772     * The intention of performance queries is to measure the work associated
773     * with commands between the begin/end delimiters and so for that to be the
774     * case we need to explicitly synchronize the parsing of commands to capture
775     * Begin/End counter snapshots with what's running across other parts of the
776     * GPU.
777     *
778     * When the command parser reaches a Begin marker it effectively needs to
779     * drain everything currently running on the GPU until the hardware is idle
780     * before capturing the first snapshot of counters - otherwise the results
781     * would also be measuring the effects of earlier commands.
782     *
783     * When the command parser reaches an End marker it needs to stall until
784     * everything currently running on the GPU has finished before capturing the
785     * end snapshot - otherwise the results won't be a complete representation
786     * of the work.
787     *
788     * To achieve this, we stall the pipeline at the pixel scoreboard (preventing
789     * any additional work from being processed by the pipeline until all pixels
790     * of the previous draw have completed).
791     *
792     * N.B. The final results are based on deltas of counters between (inside)
793     * Begin/End markers so even though the total wall clock time of the
794     * workload is stretched by larger pipeline bubbles the bubbles themselves
795     * are generally invisible to the query results. Whether that's a good or a
796     * bad thing depends on the use case. For a lower real-time impact while
797     * capturing metrics, periodic sampling may be a better choice than
798     * INTEL_performance_query.
799     *
800     *
801     * This is our Begin synchronization point to drain current work on the
802     * GPU before we capture our first counter snapshot...
803     */
804    perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
805 
806    switch (queryinfo->kind) {
807    case INTEL_PERF_QUERY_TYPE_OA:
808    case INTEL_PERF_QUERY_TYPE_RAW: {
809 
810       /* Opening an i915 perf stream implies exclusive access to the OA unit
811        * which will generate counter reports for a specific counter set with a
812        * specific layout/format so we can't begin any OA based queries that
813        * require a different counter set or format unless we get an opportunity
814        * to close the stream and open a new one...
815        */
816       uint64_t metric_id = get_metric_id(perf_ctx->perf, queryinfo);
817 
818       if (perf_ctx->oa_stream_fd != -1 &&
819           perf_ctx->current_oa_metrics_set_id != metric_id) {
820 
821          if (perf_ctx->n_oa_users != 0) {
822             DBG("WARNING: Begin failed already using perf config=%i/%"PRIu64"\n",
823                 perf_ctx->current_oa_metrics_set_id, metric_id);
824             return false;
825          } else
826             intel_perf_close(perf_ctx, queryinfo);
827       }
828 
829       /* If the OA counters aren't already on, enable them. */
830       if (perf_ctx->oa_stream_fd == -1) {
831          assert(perf_ctx->period_exponent != 0);
832 
833          if (!intel_perf_open(perf_ctx, metric_id, queryinfo->oa_format,
834                             perf_ctx->period_exponent, perf_ctx->drm_fd,
835                             perf_ctx->hw_ctx, false))
836             return false;
837       } else {
838          assert(perf_ctx->current_oa_metrics_set_id == metric_id &&
839                 perf_ctx->current_oa_format == queryinfo->oa_format);
840       }
841 
842       if (!inc_n_users(perf_ctx)) {
843          DBG("WARNING: Error enabling i915 perf stream: %m\n");
844          return false;
845       }
846 
847       if (query->oa.bo) {
848          perf_cfg->vtbl.bo_unreference(query->oa.bo);
849          query->oa.bo = NULL;
850       }
851 
852       query->oa.bo = perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
853                                              "perf. query OA MI_RPC bo",
854                                              MI_RPC_BO_SIZE);
855 #ifdef DEBUG
856       /* Pre-filling the BO helps debug whether writes landed. */
857       void *map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_WRITE);
858       memset(map, 0x80, MI_RPC_BO_SIZE);
859       perf_cfg->vtbl.bo_unmap(query->oa.bo);
860 #endif
861 
862       query->oa.begin_report_id = perf_ctx->next_query_start_report_id;
863       perf_ctx->next_query_start_report_id += 2;
864 
865       snapshot_query_layout(perf_ctx, query, false /* end_snapshot */);
866 
867       ++perf_ctx->n_active_oa_queries;
868 
869       /* No already-buffered samples can possibly be associated with this query
870        * so create a marker within the list of sample buffers enabling us to
871        * easily ignore earlier samples when processing this query after
872        * completion.
873        */
874       assert(!exec_list_is_empty(&perf_ctx->sample_buffers));
875       query->oa.samples_head = exec_list_get_tail(&perf_ctx->sample_buffers);
876 
877       struct oa_sample_buf *buf =
878          exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);
879 
880       /* This reference will ensure that future/following sample
881        * buffers (that may relate to this query) can't be freed until
882        * this drops to zero.
883        */
884       buf->refcount++;
885 
886       intel_perf_query_result_clear(&query->oa.result);
887       query->oa.results_accumulated = false;
888 
889       add_to_unaccumulated_query_list(perf_ctx, query);
890       break;
891    }
892 
893    case INTEL_PERF_QUERY_TYPE_PIPELINE:
894       if (query->pipeline_stats.bo) {
895          perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
896          query->pipeline_stats.bo = NULL;
897       }
898 
899       query->pipeline_stats.bo =
900          perf_cfg->vtbl.bo_alloc(perf_ctx->bufmgr,
901                                  "perf. query pipeline stats bo",
902                                  STATS_BO_SIZE);
903 
904       /* Take starting snapshots. */
905       snapshot_statistics_registers(perf_ctx, query, 0);
906 
907       ++perf_ctx->n_active_pipeline_stats_queries;
908       break;
909 
910    default:
911       unreachable("Unknown query type");
912       break;
913    }
914 
915    return true;
916 }
917 
918 void
919 intel_perf_end_query(struct intel_perf_context *perf_ctx,
920                      struct intel_perf_query_object *query)
921 {
922    struct intel_perf_config *perf_cfg = perf_ctx->perf;
923 
924    /* Ensure that the work associated with the queried commands will have
925     * finished before taking our query end counter readings.
926     *
927     * For more details see the comment in intel_perf_begin_query() for the
928     * corresponding flush.
929     */
930    perf_cfg->vtbl.emit_stall_at_pixel_scoreboard(perf_ctx->ctx);
931 
932    switch (query->queryinfo->kind) {
933    case INTEL_PERF_QUERY_TYPE_OA:
934    case INTEL_PERF_QUERY_TYPE_RAW:
935 
936       /* NB: It's possible that the query will have already been marked
937        * as 'accumulated' if an error was seen while reading samples
938        * from perf. In this case we mustn't try to emit a closing
939        * MI_RPC command in case the OA unit has already been disabled.
940        */
941       if (!query->oa.results_accumulated)
942          snapshot_query_layout(perf_ctx, query, true /* end_snapshot */);
943 
944       --perf_ctx->n_active_oa_queries;
945 
946       /* NB: even though the query has now ended, it can't be accumulated
947        * until the end MI_REPORT_PERF_COUNT snapshot has been written
948        * to query->oa.bo
949        */
950       break;
951 
952    case INTEL_PERF_QUERY_TYPE_PIPELINE:
953       snapshot_statistics_registers(perf_ctx, query,
954                                     STATS_BO_END_OFFSET_BYTES);
955       --perf_ctx->n_active_pipeline_stats_queries;
956       break;
957 
958    default:
959       unreachable("Unknown query type");
960       break;
961    }
962 }
963 
964 bool intel_perf_oa_stream_ready(struct intel_perf_context *perf_ctx)
965 {
966    struct pollfd pfd;
967 
968    pfd.fd = perf_ctx->oa_stream_fd;
969    pfd.events = POLLIN;
970    pfd.revents = 0;
971 
972    if (poll(&pfd, 1, 0) < 0) {
973       DBG("Error polling OA stream\n");
974       return false;
975    }
976 
977    if (!(pfd.revents & POLLIN))
978       return false;
979 
980    return true;
981 }
982 
983 ssize_t
984 intel_perf_read_oa_stream(struct intel_perf_context *perf_ctx,
985                           void* buf,
986                           size_t nbytes)
987 {
988    return read(perf_ctx->oa_stream_fd, buf, nbytes);
989 }
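/* An illustrative (hypothetical) caller-side loop using the two helpers
 * above to drain whatever the kernel has buffered without blocking:
 *
 *    uint8_t data[4096];
 *    while (intel_perf_oa_stream_ready(perf_ctx)) {
 *       ssize_t n = intel_perf_read_oa_stream(perf_ctx, data, sizeof(data));
 *       if (n <= 0)
 *          break;
 *       // walk the drm_i915_perf_record_header records in data[0..n)
 *    }
 */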
990 
991 enum OaReadStatus {
992    OA_READ_STATUS_ERROR,
993    OA_READ_STATUS_UNFINISHED,
994    OA_READ_STATUS_FINISHED,
995 };
996 
997 static enum OaReadStatus
998 read_oa_samples_until(struct intel_perf_context *perf_ctx,
999                       uint32_t start_timestamp,
1000                       uint32_t end_timestamp)
1001 {
1002    struct exec_node *tail_node =
1003       exec_list_get_tail(&perf_ctx->sample_buffers);
1004    struct oa_sample_buf *tail_buf =
1005       exec_node_data(struct oa_sample_buf, tail_node, link);
1006    uint32_t last_timestamp =
1007       tail_buf->len == 0 ? start_timestamp : tail_buf->last_timestamp;
1008 
1009    while (1) {
1010       struct oa_sample_buf *buf = get_free_sample_buf(perf_ctx);
1011       uint32_t offset;
1012       int len;
1013 
1014       while ((len = read(perf_ctx->oa_stream_fd, buf->buf,
1015                          sizeof(buf->buf))) < 0 && errno == EINTR)
1016          ;
1017 
1018       if (len <= 0) {
1019          exec_list_push_tail(&perf_ctx->free_sample_buffers, &buf->link);
1020 
1021          if (len == 0) {
1022             DBG("Spurious EOF reading i915 perf samples\n");
1023             return OA_READ_STATUS_ERROR;
1024          }
1025 
1026          if (errno != EAGAIN) {
1027             DBG("Error reading i915 perf samples: %m\n");
1028             return OA_READ_STATUS_ERROR;
1029          }
1030 
1031          if ((last_timestamp - start_timestamp) >= INT32_MAX)
1032             return OA_READ_STATUS_UNFINISHED;
1033 
1034          if ((last_timestamp - start_timestamp) <
1035               (end_timestamp - start_timestamp))
1036             return OA_READ_STATUS_UNFINISHED;
1037 
1038          return OA_READ_STATUS_FINISHED;
1039       }
1040 
1041       buf->len = len;
1042       exec_list_push_tail(&perf_ctx->sample_buffers, &buf->link);
1043 
1044       /* Go through the reports and update the last timestamp. */
1045       offset = 0;
1046       while (offset < buf->len) {
1047          const struct drm_i915_perf_record_header *header =
1048             (const struct drm_i915_perf_record_header *) &buf->buf[offset];
1049          uint32_t *report = (uint32_t *) (header + 1);
1050 
1051          if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
1052             last_timestamp = report[1];
1053 
1054          offset += header->size;
1055       }
1056 
1057       buf->last_timestamp = last_timestamp;
1058    }
1059 
1060    unreachable("not reached");
1061    return OA_READ_STATUS_ERROR;
1062 }
1063 
1064 /**
1065  * Try to read all the reports until either the delimiting timestamp
1066  * or an error arises.
1067  */
1068 static bool
1069 read_oa_samples_for_query(struct intel_perf_context *perf_ctx,
1070                           struct intel_perf_query_object *query,
1071                           void *current_batch)
1072 {
1073    uint32_t *start;
1074    uint32_t *last;
1075    uint32_t *end;
1076    struct intel_perf_config *perf_cfg = perf_ctx->perf;
1077 
1078    /* We need the MI_REPORT_PERF_COUNT to land before we can start
1079     * accumulating. */
1080    assert(!perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
1081           !perf_cfg->vtbl.bo_busy(query->oa.bo));
1082 
1083    /* Map the BO once here and let accumulate_oa_reports() unmap
1084     * it. */
1085    if (query->oa.map == NULL)
1086       query->oa.map = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->oa.bo, MAP_READ);
1087 
1088    start = last = query->oa.map;
1089    end = query->oa.map + perf_ctx->perf->query_layout.size;
1090 
1091    if (start[0] != query->oa.begin_report_id) {
1092       DBG("Spurious start report id=%"PRIu32"\n", start[0]);
1093       return true;
1094    }
1095    if (end[0] != (query->oa.begin_report_id + 1)) {
1096       DBG("Spurious end report id=%"PRIu32"\n", end[0]);
1097       return true;
1098    }
1099 
1100    /* Read the reports until the end timestamp. */
1101    switch (read_oa_samples_until(perf_ctx, start[1], end[1])) {
1102    case OA_READ_STATUS_ERROR:
1103       FALLTHROUGH; /* Let accumulate_oa_reports() deal with the error. */
1104    case OA_READ_STATUS_FINISHED:
1105       return true;
1106    case OA_READ_STATUS_UNFINISHED:
1107       return false;
1108    }
1109 
1110    unreachable("invalid read status");
1111    return false;
1112 }
1113 
1114 void
1115 intel_perf_wait_query(struct intel_perf_context *perf_ctx,
1116                       struct intel_perf_query_object *query,
1117                       void *current_batch)
1118 {
1119    struct intel_perf_config *perf_cfg = perf_ctx->perf;
1120    struct brw_bo *bo = NULL;
1121 
1122    switch (query->queryinfo->kind) {
1123    case INTEL_PERF_QUERY_TYPE_OA:
1124    case INTEL_PERF_QUERY_TYPE_RAW:
1125       bo = query->oa.bo;
1126       break;
1127 
1128    case INTEL_PERF_QUERY_TYPE_PIPELINE:
1129       bo = query->pipeline_stats.bo;
1130       break;
1131 
1132    default:
1133       unreachable("Unknown query type");
1134       break;
1135    }
1136 
1137    if (bo == NULL)
1138       return;
1139 
1140    /* If the current batch references our results bo then we need to
1141     * flush first...
1142     */
1143    if (perf_cfg->vtbl.batch_references(current_batch, bo))
1144       perf_cfg->vtbl.batchbuffer_flush(perf_ctx->ctx, __FILE__, __LINE__);
1145 
1146    perf_cfg->vtbl.bo_wait_rendering(bo);
1147 }
1148 
1149 bool
1150 intel_perf_is_query_ready(struct intel_perf_context *perf_ctx,
1151                           struct intel_perf_query_object *query,
1152                           void *current_batch)
1153 {
1154    struct intel_perf_config *perf_cfg = perf_ctx->perf;
1155 
1156    switch (query->queryinfo->kind) {
1157    case INTEL_PERF_QUERY_TYPE_OA:
1158    case INTEL_PERF_QUERY_TYPE_RAW:
1159       return (query->oa.results_accumulated ||
1160               (query->oa.bo &&
1161                !perf_cfg->vtbl.batch_references(current_batch, query->oa.bo) &&
1162                !perf_cfg->vtbl.bo_busy(query->oa.bo)));
1163 
1164    case INTEL_PERF_QUERY_TYPE_PIPELINE:
1165       return (query->pipeline_stats.bo &&
1166               !perf_cfg->vtbl.batch_references(current_batch, query->pipeline_stats.bo) &&
1167               !perf_cfg->vtbl.bo_busy(query->pipeline_stats.bo));
1168 
1169    default:
1170       unreachable("Unknown query type");
1171       break;
1172    }
1173 
1174    return false;
1175 }
1176 
1177 /**
1178  * Remove a query from the global list of unaccumulated queries once
1179  * the OA reports associated with the query have been successfully
1180  * accumulated in accumulate_oa_reports(), or when discarding unwanted
1181  * query results.
1182  */
1183 static void
1184 drop_from_unaccumulated_query_list(struct intel_perf_context *perf_ctx,
1185                                    struct intel_perf_query_object *query)
1186 {
1187    for (int i = 0; i < perf_ctx->unaccumulated_elements; i++) {
1188       if (perf_ctx->unaccumulated[i] == query) {
1189          int last_elt = --perf_ctx->unaccumulated_elements;
1190 
1191          if (i == last_elt)
1192             perf_ctx->unaccumulated[i] = NULL;
1193          else {
1194             perf_ctx->unaccumulated[i] =
1195                perf_ctx->unaccumulated[last_elt];
1196          }
1197 
1198          break;
1199       }
1200    }
1201 
1202    /* Drop our samples_head reference so that associated periodic
1203     * sample data buffers can potentially be reaped if they aren't
1204     * referenced by any other queries...
1205     */
1206 
1207    struct oa_sample_buf *buf =
1208       exec_node_data(struct oa_sample_buf, query->oa.samples_head, link);
1209 
1210    assert(buf->refcount > 0);
1211    buf->refcount--;
1212 
1213    query->oa.samples_head = NULL;
1214 
1215    reap_old_sample_buffers(perf_ctx);
1216 }
1217 
1218 /* In general, if we see anything spurious while accumulating results,
1219  * we don't try to continue accumulating the current query; we scrap
1220  * anything outstanding and then hope for the best with new
1221  * queries.
1222  */
1223 static void
1224 discard_all_queries(struct intel_perf_context *perf_ctx)
1225 {
1226    while (perf_ctx->unaccumulated_elements) {
1227       struct intel_perf_query_object *query = perf_ctx->unaccumulated[0];
1228 
1229       query->oa.results_accumulated = true;
1230       drop_from_unaccumulated_query_list(perf_ctx, query);
1231 
1232       dec_n_users(perf_ctx);
1233    }
1234 }
1235 
1236 /* Looks for the validity bit of context ID (dword 2) of an OA report. */
1237 static bool
1238 oa_report_ctx_id_valid(const struct intel_device_info *devinfo,
1239                        const uint32_t *report)
1240 {
1241    assert(devinfo->ver >= 8);
1242    if (devinfo->ver == 8)
1243       return (report[0] & (1 << 25)) != 0;
1244    return (report[0] & (1 << 16)) != 0;
1245 }
1246 
1247 /**
1248  * Accumulate raw OA counter values based on deltas between pairs of
1249  * OA reports.
1250  *
1251  * Accumulation starts from the first report captured via
1252  * MI_REPORT_PERF_COUNT (MI_RPC) by intel_perf_begin_query() until the
1253  * last MI_RPC report requested by intel_perf_end_query(). Between these
1254  * two reports there may also be some number of periodically sampled OA
1255  * reports collected via the i915 perf interface - depending on the
1256  * duration of the query.
1257  *
1258  * These periodic snapshots help to ensure we handle counter overflow
1259  * correctly by being frequent enough to ensure we don't miss multiple
1260  * overflows of a counter between snapshots. For Gfx8+ the i915 perf
1261  * snapshots provide the extra context-switch reports that let us
1262  * subtract out the progress of counters associated with other
1263  * contexts running on the system.
1264  */
1265 static void
1266 accumulate_oa_reports(struct intel_perf_context *perf_ctx,
1267                       struct intel_perf_query_object *query)
1268 {
1269    const struct intel_device_info *devinfo = perf_ctx->devinfo;
1270    uint32_t *start;
1271    uint32_t *last;
1272    uint32_t *end;
1273    struct exec_node *first_samples_node;
1274    bool last_report_ctx_match = true;
1275    int out_duration = 0;
1276 
1277    assert(query->oa.map != NULL);
1278 
1279    start = last = query->oa.map;
1280    end = query->oa.map + perf_ctx->perf->query_layout.size;
1281 
1282    if (start[0] != query->oa.begin_report_id) {
1283       DBG("Spurious start report id=%"PRIu32"\n", start[0]);
1284       goto error;
1285    }
1286    if (end[0] != (query->oa.begin_report_id + 1)) {
1287       DBG("Spurious end report id=%"PRIu32"\n", end[0]);
1288       goto error;
1289    }
1290 
1291    /* On Gfx12+ OA reports are sourced from per-context counters, so we don't
1292     * ever have to look at the global OA buffer. Yey \o/
1293     */
1294    if (perf_ctx->devinfo->ver >= 12) {
1295       last = start;
1296       goto end;
1297    }
1298 
1299    /* See if we have any periodic reports to accumulate too... */
1300 
1301    /* N.B. The oa.samples_head was set when the query began and
1302     * pointed to the tail of the perf_ctx->sample_buffers list at
1303     * the time the query started. Since the buffer existed before the
1304     * first MI_REPORT_PERF_COUNT command was emitted we therefore know
1305     * that no data in this particular node's buffer can possibly be
1306     * associated with the query - so skip ahead one...
1307     */
1308    first_samples_node = query->oa.samples_head->next;
1309 
1310    foreach_list_typed_from(struct oa_sample_buf, buf, link,
1311                            &perf_ctx->sample_buffers,
1312                            first_samples_node)
1313    {
1314       int offset = 0;
1315 
1316       while (offset < buf->len) {
1317          const struct drm_i915_perf_record_header *header =
1318             (const struct drm_i915_perf_record_header *)(buf->buf + offset);
1319 
1320          assert(header->size != 0);
1321          assert(header->size <= buf->len);
1322 
1323          offset += header->size;
1324 
1325          switch (header->type) {
1326          case DRM_I915_PERF_RECORD_SAMPLE: {
1327             uint32_t *report = (uint32_t *)(header + 1);
1328             bool report_ctx_match = true;
1329             bool add = true;
1330 
1331             /* Ignore reports that come before the start marker.
1332              * (Note: takes care to allow overflow of 32bit timestamps)
1333              */
1334             if (intel_device_info_timebase_scale(devinfo,
1335                                                report[1] - start[1]) > 5000000000) {
1336                continue;
1337             }
1338 
1339             /* Ignore reports that come after the end marker.
1340              * (Note: takes care to allow overflow of 32bit timestamps)
1341              */
1342             if (intel_device_info_timebase_scale(devinfo,
1343                                                report[1] - end[1]) <= 5000000000) {
1344                goto end;
1345             }
1346 
1347             /* For Gfx8+ since the counters continue while other
1348              * contexts are running we need to discount any unrelated
1349              * deltas. The hardware automatically generates a report
1350              * on context switch which gives us a new reference point
1351              * to continuing adding deltas from.
1352              *
1353              * For Haswell we can rely on the HW to stop the progress
1354              * of OA counters while any other context is active.
1355              */
1356             if (devinfo->ver >= 8) {
1357                /* Consider that the current report matches our context only if
1358                 * the report says the report ID is valid.
1359                 */
1360                report_ctx_match = oa_report_ctx_id_valid(devinfo, report) &&
1361                   report[2] == start[2];
1362                if (report_ctx_match)
1363                   out_duration = 0;
1364                else
1365                   out_duration++;
1366 
1367                /* Only add the delta between <last, report> if the last report
1368                 * was clearly identified as our context, or if we have at most
1369                 * 1 report without a matching ID.
1370                 *
1371                 * The OA unit will sometimes label reports with an invalid
1372                 * context ID when i915 rewrites the execlist submit register
1373                 * with the same context as the one currently running. This
1374                 * happens when i915 wants to notify the HW of a ringbuffer tail
1375                 * register update. We have to consider this report as part of
1376                 * our context as the 3d pipeline behind the OACS unit is still
1377                 * processing the operations started at the previous execlist
1378                 * submission.
1379                 */
1380                add = last_report_ctx_match && out_duration < 2;
1381             }
1382 
1383             if (add) {
1384                intel_perf_query_result_accumulate(&query->oa.result,
1385                                                 query->queryinfo,
1386                                                 last, report);
1387             } else {
1388                /* We're not adding the delta because we've identified it's not
1389                 * for the context we filter for. We can consider that the
1390                 * query was split.
1391                 */
1392                query->oa.result.query_disjoint = true;
1393             }
1394 
1395             last = report;
1396             last_report_ctx_match = report_ctx_match;
1397 
1398             break;
1399          }
1400 
1401          case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
1402              DBG("i915 perf: OA error: all reports lost\n");
1403              goto error;
1404          case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
1405              DBG("i915 perf: OA report lost\n");
1406              break;
1407          }
1408       }
1409    }
1410 
1411 end:
1412 
1413    intel_perf_query_result_accumulate(&query->oa.result, query->queryinfo,
1414                                     last, end);
1415 
1416    query->oa.results_accumulated = true;
1417    drop_from_unaccumulated_query_list(perf_ctx, query);
1418    dec_n_users(perf_ctx);
1419 
1420    return;
1421 
1422 error:
1423 
1424    discard_all_queries(perf_ctx);
1425 }
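
/* Illustrative sketch (not part of the original file): on Gfx8+ the decision
 * above about whether to accumulate the <last, report> delta reduces to a
 * small predicate over two pieces of state carried from report to report:
 * whether the previous report was positively identified as our context, and
 * how many consecutive reports lacked a matching context ID. The helper name
 * is hypothetical; it assumes a C99 bool is available.
 */
static inline bool
sketch_should_add_delta(bool last_report_ctx_match, int out_duration)
{
   /* Keep accumulating while the previous report was ours, or while at most
    * one report in a row carried an invalid context ID (the OA unit can emit
    * such a report when i915 re-submits the same context to bump the ring
    * tail, and that work still belongs to us).
    */
   return last_report_ctx_match && out_duration < 2;
}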
1426 
1427 void
1428 intel_perf_delete_query(struct intel_perf_context *perf_ctx,
1429                         struct intel_perf_query_object *query)
1430 {
1431    struct intel_perf_config *perf_cfg = perf_ctx->perf;
1432 
1433    /* We can assume that the frontend waits for a query to complete
1434     * before ever calling into here, so we don't have to worry about
1435     * deleting an in-flight query object.
1436     */
1437    switch (query->queryinfo->kind) {
1438    case INTEL_PERF_QUERY_TYPE_OA:
1439    case INTEL_PERF_QUERY_TYPE_RAW:
1440       if (query->oa.bo) {
1441          if (!query->oa.results_accumulated) {
1442             drop_from_unaccumulated_query_list(perf_ctx, query);
1443             dec_n_users(perf_ctx);
1444          }
1445 
1446          perf_cfg->vtbl.bo_unreference(query->oa.bo);
1447          query->oa.bo = NULL;
1448       }
1449 
1450       query->oa.results_accumulated = false;
1451       break;
1452 
1453    case INTEL_PERF_QUERY_TYPE_PIPELINE:
1454       if (query->pipeline_stats.bo) {
1455          perf_cfg->vtbl.bo_unreference(query->pipeline_stats.bo);
1456          query->pipeline_stats.bo = NULL;
1457       }
1458       break;
1459 
1460    default:
1461       unreachable("Unknown query type");
1462       break;
1463    }
1464 
1465    /* Once the last query instance is deleted, the INTEL_performance_query
1466     * extension is no longer in use, so this is a good time to free our
1467     * cache of sample buffers and close any current i915-perf stream.
1468     */
1469    if (--perf_ctx->n_query_instances == 0) {
1470       free_sample_bufs(perf_ctx);
1471       intel_perf_close(perf_ctx, query->queryinfo);
1472    }
1473 
1474    free(query);
1475 }
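
/* Illustrative sketch (hypothetical names, not driver code): the cleanup at
 * the end of intel_perf_delete_query() is a plain shared-resource count.
 * Every live query object counts as one user of the sample-buffer cache and
 * of the i915-perf stream; only when the last object goes away are those
 * torn down.
 */
struct sketch_shared_state {
   int n_users;
};

static void
sketch_release_user(struct sketch_shared_state *shared, void (*teardown)(void))
{
   if (--shared->n_users == 0)
      teardown();   /* e.g. free cached sample buffers, close the stream */
}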
1476 
1477 static int
1478 get_oa_counter_data(struct intel_perf_context *perf_ctx,
1479                     struct intel_perf_query_object *query,
1480                     size_t data_size,
1481                     uint8_t *data)
1482 {
1483    struct intel_perf_config *perf_cfg = perf_ctx->perf;
1484    const struct intel_perf_query_info *queryinfo = query->queryinfo;
1485    int n_counters = queryinfo->n_counters;
1486    int written = 0;
1487 
1488    for (int i = 0; i < n_counters; i++) {
1489       const struct intel_perf_query_counter *counter = &queryinfo->counters[i];
1490       uint64_t *out_uint64;
1491       float *out_float;
1492       size_t counter_size = intel_perf_query_counter_get_size(counter);
1493 
1494       if (counter_size) {
1495          switch (counter->data_type) {
1496          case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
1497             out_uint64 = (uint64_t *)(data + counter->offset);
1498             *out_uint64 =
1499                counter->oa_counter_read_uint64(perf_cfg, queryinfo,
1500                                                &query->oa.result);
1501             break;
1502          case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
1503             out_float = (float *)(data + counter->offset);
1504             *out_float =
1505                counter->oa_counter_read_float(perf_cfg, queryinfo,
1506                                               &query->oa.result);
1507             break;
1508          default:
1509             /* So far we aren't using uint32, double or bool32... */
1510             unreachable("unexpected counter data type");
1511          }
1512 
1513          if (counter->offset + counter_size > written)
1514             written = counter->offset + counter_size;
1515       }
1516    }
1517 
1518    return written;
1519 }
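
/* Illustrative sketch (hypothetical types): get_oa_counter_data() scatters
 * each counter into the caller's buffer at counter->offset and returns the
 * high-water mark (largest offset + size touched) rather than a packed byte
 * count, so the buffer may legitimately contain holes. A reduced model of
 * that bookkeeping, assuming <stddef.h> for size_t:
 */
struct sketch_counter_slot {
   size_t offset;   /* byte offset of the counter in the output buffer */
   size_t size;     /* size of the counter value in bytes */
};

static size_t
sketch_written_bytes(const struct sketch_counter_slot *slots, int n_slots)
{
   size_t written = 0;

   for (int i = 0; i < n_slots; i++) {
      size_t end = slots[i].offset + slots[i].size;
      if (end > written)
         written = end;
   }

   return written;
}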
1520 
1521 static int
1522 get_pipeline_stats_data(struct intel_perf_context *perf_ctx,
1523                         struct intel_perf_query_object *query,
1524                         size_t data_size,
1525                         uint8_t *data)
1526 
1527 {
1528    struct intel_perf_config *perf_cfg = perf_ctx->perf;
1529    const struct intel_perf_query_info *queryinfo = query->queryinfo;
1530    int n_counters = queryinfo->n_counters;
1531    uint8_t *p = data;
1532 
1533    uint64_t *start = perf_cfg->vtbl.bo_map(perf_ctx->ctx, query->pipeline_stats.bo, MAP_READ);
1534    uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));
1535 
1536    for (int i = 0; i < n_counters; i++) {
1537       const struct intel_perf_query_counter *counter = &queryinfo->counters[i];
1538       uint64_t value = end[i] - start[i];
1539 
1540       if (counter->pipeline_stat.numerator !=
1541           counter->pipeline_stat.denominator) {
1542          value *= counter->pipeline_stat.numerator;
1543          value /= counter->pipeline_stat.denominator;
1544       }
1545 
1546       *((uint64_t *)p) = value;
1547       p += 8;
1548    }
1549 
1550    perf_cfg->vtbl.bo_unmap(query->pipeline_stats.bo);
1551 
1552    return p - data;
1553 }
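
/* Illustrative sketch: each pipeline statistic above is the difference
 * between the end and begin snapshots written around the workload,
 * optionally rescaled by the counter's numerator/denominator pair. The
 * helper name is hypothetical; it assumes <stdint.h> for uint64_t.
 */
static uint64_t
sketch_pipeline_stat_value(uint64_t begin, uint64_t end,
                           uint64_t numerator, uint64_t denominator)
{
   uint64_t value = end - begin;

   /* Only rescale when the ratio is not 1:1, mirroring the loop above. */
   if (numerator != denominator) {
      value *= numerator;
      value /= denominator;
   }

   return value;
}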
1554 
1555 void
1556 intel_perf_get_query_data(struct intel_perf_context *perf_ctx,
1557                           struct intel_perf_query_object *query,
1558                           void *current_batch,
1559                           int data_size,
1560                           unsigned *data,
1561                           unsigned *bytes_written)
1562 {
1563    struct intel_perf_config *perf_cfg = perf_ctx->perf;
1564    int written = 0;
1565 
1566    switch (query->queryinfo->kind) {
1567    case INTEL_PERF_QUERY_TYPE_OA:
1568    case INTEL_PERF_QUERY_TYPE_RAW:
1569       if (!query->oa.results_accumulated) {
1570          /* Due to the rate at which the i915-perf driver samples the OA
1571           * buffer, there can be a 5ms delay between Mesa seeing the query
1572           * complete and i915 making all of the OA buffer reports available
1573           * to us. We need to wait for all of the reports to come in before
1574           * we can do the post-processing that removes unrelated deltas.
1575           * There is an i915-perf series to address this issue, but it has
1576           * not been merged upstream yet.
1577           */
1578          while (!read_oa_samples_for_query(perf_ctx, query, current_batch))
1579             ;
1580 
1581          uint32_t *begin_report = query->oa.map;
1582          uint32_t *end_report = query->oa.map + perf_cfg->query_layout.size;
1583          intel_perf_query_result_accumulate_fields(&query->oa.result,
1584                                                  query->queryinfo,
1585                                                  begin_report,
1586                                                  end_report,
1587                                                  true /* no_oa_accumulate */);
1588          accumulate_oa_reports(perf_ctx, query);
1589          assert(query->oa.results_accumulated);
1590 
1591          perf_cfg->vtbl.bo_unmap(query->oa.bo);
1592          query->oa.map = NULL;
1593       }
1594       if (query->queryinfo->kind == INTEL_PERF_QUERY_TYPE_OA) {
1595          written = get_oa_counter_data(perf_ctx, query, data_size, (uint8_t *)data);
1596       } else {
1597          const struct intel_device_info *devinfo = perf_ctx->devinfo;
1598 
1599          written = intel_perf_query_result_write_mdapi((uint8_t *)data, data_size,
1600                                                      devinfo, query->queryinfo,
1601                                                      &query->oa.result);
1602       }
1603       break;
1604 
1605    case INTEL_PERF_QUERY_TYPE_PIPELINE:
1606       written = get_pipeline_stats_data(perf_ctx, query, data_size, (uint8_t *)data);
1607       break;
1608 
1609    default:
1610       unreachable("Unknown query type");
1611       break;
1612    }
1613 
1614    if (bytes_written)
1615       *bytes_written = written;
1616 }
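
/* Illustrative caller sketch (hypothetical helper, not part of the file): a
 * frontend would typically check intel_perf_is_query_ready() and only then
 * pull the counter data with a single intel_perf_get_query_data() call. The
 * output buffer must be at least the query's advertised data size;
 * bytes_written reports how much of it was actually populated.
 */
static bool
sketch_try_read_query(struct intel_perf_context *perf_ctx,
                      struct intel_perf_query_object *query,
                      void *current_batch,
                      unsigned *out, int out_size)
{
   unsigned bytes_written = 0;

   if (!intel_perf_is_query_ready(perf_ctx, query, current_batch))
      return false;

   intel_perf_get_query_data(perf_ctx, query, current_batch,
                             out_size, out, &bytes_written);

   return bytes_written > 0;
}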
1617 
1618 void
1619 intel_perf_dump_query_count(struct intel_perf_context *perf_ctx)
1620 {
1621    DBG("Queries: (Open queries = %d, OA users = %d)\n",
1622        perf_ctx->n_active_oa_queries, perf_ctx->n_oa_users);
1623 }
1624 
1625 void
1626 intel_perf_dump_query(struct intel_perf_context *ctx,
1627                       struct intel_perf_query_object *obj,
1628                       void *current_batch)
1629 {
1630    switch (obj->queryinfo->kind) {
1631    case INTEL_PERF_QUERY_TYPE_OA:
1632    case INTEL_PERF_QUERY_TYPE_RAW:
1633       DBG("BO: %-4s OA data: %-10s %-15s\n",
1634           obj->oa.bo ? "yes," : "no,",
1635           intel_perf_is_query_ready(ctx, obj, current_batch) ? "ready," : "not ready,",
1636           obj->oa.results_accumulated ? "accumulated" : "not accumulated");
1637       break;
1638    case INTEL_PERF_QUERY_TYPE_PIPELINE:
1639       DBG("BO: %-4s\n",
1640           obj->pipeline_stats.bo ? "yes" : "no");
1641       break;
1642    default:
1643       unreachable("Unknown query type");
1644       break;
1645    }
1646 }
1647