1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file brw_performance_query.c
26 *
27 * Implementation of the GL_INTEL_performance_query extension.
28 *
29 * Currently there are two possible counter sources exposed here:
30 *
31 * On Gen6+ hardware we have numerous 64bit Pipeline Statistics Registers
32 * that we can snapshot at the beginning and end of a query.
33 *
34 * On Gen7.5+ we have Observability Architecture counters, which are
35 * covered in a separate document from the rest of the PRMs, available at:
36 * https://01.org/linuxgraphics/documentation/driver-documentation-prms
37 * => 2013 Intel Core Processor Family => Observability Performance Counters
38 * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell,
39 * though notably we currently only support OA counters for Haswell+)
40 */
41
42 #include <limits.h>
43 #include <dirent.h>
44
45 /* put before sys/types.h to silence glibc warnings */
46 #ifdef MAJOR_IN_MKDEV
47 #include <sys/mkdev.h>
48 #endif
49 #ifdef MAJOR_IN_SYSMACROS
50 #include <sys/sysmacros.h>
51 #endif
52 #include <sys/types.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #include <sys/mman.h>
56 #include <sys/ioctl.h>
57
58 #include <xf86drm.h>
59 #include <i915_drm.h>
60
61 #include "main/hash.h"
62 #include "main/macros.h"
63 #include "main/mtypes.h"
64 #include "main/performance_query.h"
65
66 #include "util/bitset.h"
67 #include "util/ralloc.h"
68 #include "util/hash_table.h"
69 #include "util/list.h"
70
71 #include "brw_context.h"
72 #include "brw_defines.h"
73 #include "brw_performance_query.h"
74 #include "brw_oa_hsw.h"
75 #include "brw_oa_bdw.h"
76 #include "brw_oa_chv.h"
77 #include "brw_oa_sklgt2.h"
78 #include "brw_oa_sklgt3.h"
79 #include "brw_oa_sklgt4.h"
80 #include "brw_oa_bxt.h"
81 #include "brw_oa_kblgt2.h"
82 #include "brw_oa_kblgt3.h"
83 #include "brw_oa_glk.h"
84 #include "brw_oa_cflgt2.h"
85 #include "brw_oa_cflgt3.h"
86 #include "intel_batchbuffer.h"
87
88 #define FILE_DEBUG_FLAG DEBUG_PERFMON
89
90 /*
91 * The largest OA formats we can use include:
92 * For Haswell:
93 * 1 timestamp, 45 A counters, 8 B counters and 8 C counters.
94 * For Gen8+
95 * 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
96 */
97 #define MAX_OA_REPORT_COUNTERS 62
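/*
 * Worked out from the format descriptions above:
 *   Haswell (A45_B8_C8):        1 + 45 + 8 + 8 = 62 counters
 *   Gen8+ (A32u40_A4u32_B8_C8): 1 + 1 + 36 + 8 + 8 = 54 counters
 * so 62 is the upper bound we size accumulators for.
 */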
98
99 #define OAREPORT_REASON_MASK 0x3f
100 #define OAREPORT_REASON_SHIFT 19
101 #define OAREPORT_REASON_TIMER (1<<0)
102 #define OAREPORT_REASON_TRIGGER1 (1<<1)
103 #define OAREPORT_REASON_TRIGGER2 (1<<2)
104 #define OAREPORT_REASON_CTX_SWITCH (1<<3)
105 #define OAREPORT_REASON_GO_TRANSITION (1<<4)
106
107 #define I915_PERF_OA_SAMPLE_SIZE (8 + /* drm_i915_perf_record_header */ \
108 256) /* OA counter report */
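/*
 * I.e. 8 + 256 = 264 bytes per sample record, so the 10-record buffer in
 * struct brw_oa_sample_buf below comes to 2640 bytes.
 */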
109
110 /**
111 * Periodic OA samples are read() into these buffer structures via the
112 * i915 perf kernel interface and appended to the
113 * brw->perfquery.sample_buffers linked list. When we process the
114 * results of an OA metrics query we need to consider all the periodic
115 * samples between the Begin and End MI_REPORT_PERF_COUNT command
116 * markers.
117 *
118 * 'Periodic' is a simplification, as there are other automatic reports
119 * written by the hardware that are also buffered here.
120 *
121 * Considering three queries, A, B and C:
122 *
123 * Time ---->
124 * ________________A_________________
125 * | |
126 * | ________B_________ _____C___________
127 * | | | | | |
128 *
129 * And an illustration of sample buffers read over this time frame:
130 * [HEAD ][ ][ ][ ][ ][ ][ ][ ][TAIL ]
131 *
132 * These nodes may hold samples for query A:
133 * [ ][ ][ A ][ A ][ A ][ A ][ A ][ ][ ]
134 *
135 * These nodes may hold samples for query B:
136 * [ ][ ][ B ][ B ][ B ][ ][ ][ ][ ]
137 *
138 * These nodes may hold samples for query C:
139 * [ ][ ][ ][ ][ ][ C ][ C ][ C ][ ]
140 *
141 * The illustration assumes we have an even distribution of periodic
142 * samples so all nodes have the same size plotted against time:
143 *
144 * Note, to simplify code, the list is never empty.
145 *
146 * With overlapping queries we can see that periodic OA reports may
147 * relate to multiple queries and care needs to be taken to keep
148 * track of sample buffers until there are no queries that might
149 * depend on their contents.
150 *
151 * We use a node ref counting system where a reference ensures that a
152 * node and all following nodes can't be freed/recycled until the
153 * reference drops to zero.
154 *
155 * E.g. with a ref of one here:
156 * [ 0 ][ 0 ][ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
157 *
158 * These nodes could be freed or recycled ("reaped"):
159 * [ 0 ][ 0 ]
160 *
161 * These must be preserved until the leading ref drops to zero:
162 * [ 1 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ][ 0 ]
163 *
164 * When a query starts we take a reference on the current tail of
165 * the list, knowing that no already-buffered samples can possibly
166 * relate to the newly-started query. A pointer to this node is
167 * also saved in the query object's ->oa.samples_head.
168 *
169 * E.g. starting query A while there are two nodes in .sample_buffers:
170 * ________________A________
171 * |
172 *
173 * [ 0 ][ 1 ]
174 * ^_______ Add a reference and store pointer to node in
175 * A->oa.samples_head
176 *
177 * Moving forward to when the B query starts with no new buffer nodes:
178 * (for reference, i915 perf reads() are only done when queries finish)
179 * ________________A_______
180 * | ________B___
181 * | |
182 *
183 * [ 0 ][ 2 ]
184 * ^_______ Add a reference and store pointer to
185 * node in B->oa.samples_head
186 *
187 * Once a query is finished, after an OA query has become 'Ready',
188 * once the End OA report has landed and after we have processed
189 * all the intermediate periodic samples then we drop the
190 * ->oa.samples_head reference we took at the start.
191 *
192 * So when the B query has finished we have:
193 * ________________A________
194 * | ______B___________
195 * | | |
196 * [ 0 ][ 1 ][ 0 ][ 0 ][ 0 ]
197 * ^_______ Drop B->oa.samples_head reference
198 *
199 * We still can't free these due to the A->oa.samples_head ref:
200 * [ 1 ][ 0 ][ 0 ][ 0 ]
201 *
202 * When the A query finishes: (note there's a new ref for C's samples_head)
203 * ________________A_________________
204 * | |
205 * | _____C_________
206 * | | |
207 * [ 0 ][ 0 ][ 0 ][ 0 ][ 1 ][ 0 ][ 0 ]
208 * ^_______ Drop A->oa.samples_head reference
209 *
210 * And we can now reap these nodes up to the C->oa.samples_head:
211 * [ X ][ X ][ X ][ X ]
212 * keeping -> [ 1 ][ 0 ][ 0 ]
213 *
214 * We reap old sample buffers each time we finish processing an OA
215 * query by iterating the sample_buffers list from the head until we
216 * find a referenced node and stop.
217 *
218 * Reaped buffers move to a perfquery.free_sample_buffers list and
219 * when we come to read() we first look to recycle a buffer from the
220 * free_sample_buffers list before allocating a new buffer.
221 */
222 struct brw_oa_sample_buf {
223 struct exec_node link;
224 int refcount;
225 int len;
226 uint8_t buf[I915_PERF_OA_SAMPLE_SIZE * 10];
227 uint32_t last_timestamp;
228 };
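/*
 * Purely illustrative sketch (not compiled) of the reference discipline
 * described above, expressed against this structure. A query pins the
 * current tail buffer when it begins and unpins it once its reports have
 * been accumulated; buffers are only reaped from the head while their
 * refcount is zero. The real code lives in brw_begin_perf_query(),
 * drop_from_unaccumulated_query_list() and reap_old_sample_buffers()
 * below.
 *
 *    // Begin: pin everything from the current tail onwards
 *    struct exec_node *tail = exec_list_get_tail(&brw->perfquery.sample_buffers);
 *    struct brw_oa_sample_buf *buf =
 *       exec_node_data(struct brw_oa_sample_buf, tail, link);
 *    buf->refcount++;
 *    obj->oa.samples_head = tail;
 *
 *    // Done accumulating: unpin and let the reaper walk from the head
 *    buf = exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);
 *    buf->refcount--;
 *    obj->oa.samples_head = NULL;
 *    reap_old_sample_buffers(brw);
 */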
229
230 /**
231 * i965 representation of a performance query object.
232 *
233 * NB: We want to keep this structure relatively lean considering that
234 * applications may expect to allocate enough objects to be able to
235 * query around all draw calls in a frame.
236 */
237 struct brw_perf_query_object
238 {
239 struct gl_perf_query_object base;
240
241 const struct brw_perf_query_info *query;
242
243 /* See query->kind to know which state below is in use... */
244 union {
245 struct {
246
247 /**
248 * BO containing OA counter snapshots at query Begin/End time.
249 */
250 struct brw_bo *bo;
251
252 /**
253 * Address of the mapping of @bo
254 */
255 void *map;
256
257 /**
258 * The MI_REPORT_PERF_COUNT command lets us specify a unique
259 * ID that will be reflected in the resulting OA report
260 * that's written by the GPU. This is the ID we're expecting
261 * in the begin report and the end report should be
262 * @begin_report_id + 1.
263 */
264 int begin_report_id;
265
266 /**
267 * Reference the head of the brw->perfquery.sample_buffers
268 * list at the time that the query started (so we only need
269 * to look at nodes after this point when looking for samples
270 * related to this query)
271 *
272 * (See struct brw_oa_sample_buf description for more details)
273 */
274 struct exec_node *samples_head;
275
276 /**
277 * Storage for the final accumulated OA counters.
278 */
279 uint64_t accumulator[MAX_OA_REPORT_COUNTERS];
280
281 /**
282 * false while in the unaccumulated_elements list, and set to
283 * true when the final, end MI_RPC snapshot has been
284 * accumulated.
285 */
286 bool results_accumulated;
287
288 } oa;
289
290 struct {
291 /**
292 * BO containing starting and ending snapshots for the
293 * statistics counters.
294 */
295 struct brw_bo *bo;
296 } pipeline_stats;
297 };
298 };
299
300 /** Downcasting convenience macro. */
301 static inline struct brw_perf_query_object *
302 brw_perf_query(struct gl_perf_query_object *o)
303 {
304 return (struct brw_perf_query_object *) o;
305 }
306
307 #define STATS_BO_SIZE 4096
308 #define STATS_BO_END_OFFSET_BYTES (STATS_BO_SIZE / 2)
309 #define MAX_STAT_COUNTERS (STATS_BO_END_OFFSET_BYTES / 8)
310
311 #define MI_RPC_BO_SIZE 4096
312 #define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2)
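/*
 * Both BOs use the same split layout: begin snapshots are written at
 * offset 0 and end snapshots at the half-way *_END_OFFSET_BYTES mark.
 * For the stats BO that leaves (4096 / 2) / 8 = 256 64bit slots per
 * snapshot, which is where MAX_STAT_COUNTERS comes from.
 */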
313
314 /******************************************************************************/
315
316 static bool
317 brw_is_perf_query_ready(struct gl_context *ctx,
318 struct gl_perf_query_object *o);
319
320 static void
321 dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
322 {
323 struct gl_context *ctx = brw_void;
324 struct gl_perf_query_object *o = query_void;
325 struct brw_perf_query_object *obj = query_void;
326
327 switch (obj->query->kind) {
328 case OA_COUNTERS:
329 DBG("%4d: %-6s %-8s BO: %-4s OA data: %-10s %-15s\n",
330 id,
331 o->Used ? "Dirty," : "New,",
332 o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
333 obj->oa.bo ? "yes," : "no,",
334 brw_is_perf_query_ready(ctx, o) ? "ready," : "not ready,",
335 obj->oa.results_accumulated ? "accumulated" : "not accumulated");
336 break;
337 case PIPELINE_STATS:
338 DBG("%4d: %-6s %-8s BO: %-4s\n",
339 id,
340 o->Used ? "Dirty," : "New,",
341 o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
342 obj->pipeline_stats.bo ? "yes" : "no");
343 break;
344 }
345 }
346
347 static void
348 dump_perf_queries(struct brw_context *brw)
349 {
350 struct gl_context *ctx = &brw->ctx;
351 DBG("Queries: (Open queries = %d, OA users = %d)\n",
352 brw->perfquery.n_active_oa_queries, brw->perfquery.n_oa_users);
353 _mesa_HashWalk(ctx->PerfQuery.Objects, dump_perf_query_callback, brw);
354 }
355
356 /******************************************************************************/
357
358 static struct brw_oa_sample_buf *
359 get_free_sample_buf(struct brw_context *brw)
360 {
361 struct exec_node *node = exec_list_pop_head(&brw->perfquery.free_sample_buffers);
362 struct brw_oa_sample_buf *buf;
363
364 if (node)
365 buf = exec_node_data(struct brw_oa_sample_buf, node, link);
366 else {
367 buf = ralloc_size(brw, sizeof(*buf));
368
369 exec_node_init(&buf->link);
370 buf->refcount = 0;
371 buf->len = 0;
372 }
373
374 return buf;
375 }
376
377 static void
378 reap_old_sample_buffers(struct brw_context *brw)
379 {
380 struct exec_node *tail_node =
381 exec_list_get_tail(&brw->perfquery.sample_buffers);
382 struct brw_oa_sample_buf *tail_buf =
383 exec_node_data(struct brw_oa_sample_buf, tail_node, link);
384
385 /* Remove all old, unreferenced sample buffers walking forward from
386 * the head of the list, except always leave at least one node in
387 * the list so we always have a node to reference when we Begin
388 * a new query.
389 */
390 foreach_list_typed_safe(struct brw_oa_sample_buf, buf, link,
391 &brw->perfquery.sample_buffers)
392 {
393 if (buf->refcount == 0 && buf != tail_buf) {
394 exec_node_remove(&buf->link);
395 exec_list_push_head(&brw->perfquery.free_sample_buffers, &buf->link);
396 } else
397 return;
398 }
399 }
400
401 static void
402 free_sample_bufs(struct brw_context *brw)
403 {
404 foreach_list_typed_safe(struct brw_oa_sample_buf, buf, link,
405 &brw->perfquery.free_sample_buffers)
406 ralloc_free(buf);
407
408 exec_list_make_empty(&brw->perfquery.free_sample_buffers);
409 }
410
411 /******************************************************************************/
412
413 /**
414 * Driver hook for glGetPerfQueryInfoINTEL().
415 */
416 static void
417 brw_get_perf_query_info(struct gl_context *ctx,
418 unsigned query_index,
419 const char **name,
420 GLuint *data_size,
421 GLuint *n_counters,
422 GLuint *n_active)
423 {
424 struct brw_context *brw = brw_context(ctx);
425 const struct brw_perf_query_info *query =
426 &brw->perfquery.queries[query_index];
427
428 *name = query->name;
429 *data_size = query->data_size;
430 *n_counters = query->n_counters;
431
432 switch (query->kind) {
433 case OA_COUNTERS:
434 *n_active = brw->perfquery.n_active_oa_queries;
435 break;
436
437 case PIPELINE_STATS:
438 *n_active = brw->perfquery.n_active_pipeline_stats_queries;
439 break;
440 }
441 }
442
443 /**
444 * Driver hook for glGetPerfCounterInfoINTEL().
445 */
446 static void
447 brw_get_perf_counter_info(struct gl_context *ctx,
448 unsigned query_index,
449 unsigned counter_index,
450 const char **name,
451 const char **desc,
452 GLuint *offset,
453 GLuint *data_size,
454 GLuint *type_enum,
455 GLuint *data_type_enum,
456 GLuint64 *raw_max)
457 {
458 struct brw_context *brw = brw_context(ctx);
459 const struct brw_perf_query_info *query =
460 &brw->perfquery.queries[query_index];
461 const struct brw_perf_query_counter *counter =
462 &query->counters[counter_index];
463
464 *name = counter->name;
465 *desc = counter->desc;
466 *offset = counter->offset;
467 *data_size = counter->size;
468 *type_enum = counter->type;
469 *data_type_enum = counter->data_type;
470 *raw_max = counter->raw_max;
471 }
472
473 /******************************************************************************/
474
475 /**
476 * Emit MI_STORE_REGISTER_MEM commands to capture all of the
477 * pipeline statistics for the performance query object.
478 */
479 static void
480 snapshot_statistics_registers(struct brw_context *brw,
481 struct brw_perf_query_object *obj,
482 uint32_t offset_in_bytes)
483 {
484 const struct brw_perf_query_info *query = obj->query;
485 const int n_counters = query->n_counters;
486
487 for (int i = 0; i < n_counters; i++) {
488 const struct brw_perf_query_counter *counter = &query->counters[i];
489
490 assert(counter->data_type == GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL);
491
492 brw_store_register_mem64(brw, obj->pipeline_stats.bo,
493 counter->pipeline_stat.reg,
494 offset_in_bytes + i * sizeof(uint64_t));
495 }
496 }
497
498 /**
499 * Add a query to the global list of "unaccumulated queries."
500 *
501 * Queries are tracked here until all the associated OA reports have
502 * been accumulated via accumulate_oa_reports() after the end
503 * MI_REPORT_PERF_COUNT has landed in query->oa.bo.
504 */
505 static void
506 add_to_unaccumulated_query_list(struct brw_context *brw,
507 struct brw_perf_query_object *obj)
508 {
509 if (brw->perfquery.unaccumulated_elements >=
510 brw->perfquery.unaccumulated_array_size)
511 {
512 brw->perfquery.unaccumulated_array_size *= 1.5;
513 brw->perfquery.unaccumulated =
514 reralloc(brw, brw->perfquery.unaccumulated,
515 struct brw_perf_query_object *,
516 brw->perfquery.unaccumulated_array_size);
517 }
518
519 brw->perfquery.unaccumulated[brw->perfquery.unaccumulated_elements++] = obj;
520 }
521
522 /**
523 * Remove a query from the global list of unaccumulated queries
524 * once the OA reports associated with the query have been
525 * successfully accumulated in accumulate_oa_reports() or when
526 * discarding unwanted query results.
527 */
528 static void
529 drop_from_unaccumulated_query_list(struct brw_context *brw,
530 struct brw_perf_query_object *obj)
531 {
532 for (int i = 0; i < brw->perfquery.unaccumulated_elements; i++) {
533 if (brw->perfquery.unaccumulated[i] == obj) {
534 int last_elt = --brw->perfquery.unaccumulated_elements;
535
536 if (i == last_elt)
537 brw->perfquery.unaccumulated[i] = NULL;
538 else {
539 brw->perfquery.unaccumulated[i] =
540 brw->perfquery.unaccumulated[last_elt];
541 }
542
543 break;
544 }
545 }
546
547 /* Drop our samples_head reference so that associated periodic
548 * sample data buffers can potentially be reaped if they aren't
549 * referenced by any other queries...
550 */
551
552 struct brw_oa_sample_buf *buf =
553 exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);
554
555 assert(buf->refcount > 0);
556 buf->refcount--;
557
558 obj->oa.samples_head = NULL;
559
560 reap_old_sample_buffers(brw);
561 }
562
563 static uint64_t
564 timebase_scale(struct brw_context *brw, uint32_t u32_time_delta)
565 {
566 const struct gen_device_info *devinfo = &brw->screen->devinfo;
567 uint64_t tmp = ((uint64_t)u32_time_delta) * 1000000000ull;
568
569 return tmp ? tmp / devinfo->timestamp_frequency : 0;
570 }
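/*
 * E.g. for timebase_scale() above: assuming the ~12.5MHz timestamp
 * frequency implied by the 80ns tick quoted for Haswell further below, a
 * raw delta of 1250 ticks scales to 1250 * 1000000000 / 12500000 =
 * 100000ns.
 */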
571
572 static void
573 accumulate_uint32(const uint32_t *report0,
574 const uint32_t *report1,
575 uint64_t *accumulator)
576 {
577 *accumulator += (uint32_t)(*report1 - *report0);
578 }
579
580 static void
581 accumulate_uint40(int a_index,
582 const uint32_t *report0,
583 const uint32_t *report1,
584 uint64_t *accumulator)
585 {
586 const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
587 const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
588 uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
589 uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
590 uint64_t value0 = report0[a_index + 4] | high0;
591 uint64_t value1 = report1[a_index + 4] | high1;
592 uint64_t delta;
593
594 if (value0 > value1)
595 delta = (1ULL << 40) + value1 - value0;
596 else
597 delta = value1 - value0;
598
599 *accumulator += delta;
600 }
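/*
 * Example of the wraparound handling above: if a 40bit A counter reads
 * value0 = 0xfffffff000 in the start report and value1 = 0xff in the end
 * report, the delta is (1ULL << 40) + 0xff - 0xfffffff000 = 0x10ff,
 * rather than the bogus value a plain unsigned subtraction would give.
 */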
601
602 /**
603 * Given pointers to starting and ending OA snapshots, add the deltas for each
604 * counter to the results.
605 */
606 static void
607 add_deltas(struct brw_context *brw,
608 struct brw_perf_query_object *obj,
609 const uint32_t *start,
610 const uint32_t *end)
611 {
612 const struct brw_perf_query_info *query = obj->query;
613 uint64_t *accumulator = obj->oa.accumulator;
614 int idx = 0;
615 int i;
616
617 switch (query->oa_format) {
618 case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
619 accumulate_uint32(start + 1, end + 1, accumulator + idx++); /* timestamp */
620 accumulate_uint32(start + 3, end + 3, accumulator + idx++); /* clock */
621
622 /* 32x 40bit A counters... */
623 for (i = 0; i < 32; i++)
624 accumulate_uint40(i, start, end, accumulator + idx++);
625
626 /* 4x 32bit A counters... */
627 for (i = 0; i < 4; i++)
628 accumulate_uint32(start + 36 + i, end + 36 + i, accumulator + idx++);
629
630 /* 8x 32bit B counters + 8x 32bit C counters... */
631 for (i = 0; i < 16; i++)
632 accumulate_uint32(start + 48 + i, end + 48 + i, accumulator + idx++);
633
634 break;
635 case I915_OA_FORMAT_A45_B8_C8:
636 accumulate_uint32(start + 1, end + 1, accumulator); /* timestamp */
637
638 for (i = 0; i < 61; i++)
639 accumulate_uint32(start + 3 + i, end + 3 + i, accumulator + 1 + i);
640
641 break;
642 default:
643 unreachable("Can't accumulate OA counters in unknown format");
644 }
645 }
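/*
 * For reference, the dword layout the loops above assume for the Gen8+
 * A32u40_A4u32_B8_C8 format: dword 1 is the timestamp, dword 3 the clock,
 * dwords 4..35 hold the low 32 bits of A0-A31 (their high bytes are
 * packed starting at dword 40), dwords 36..39 hold the 32bit A32-A35 and
 * dwords 48..63 the B and C counters. The Haswell A45_B8_C8 format is
 * simply a timestamp in dword 1 followed by 61 32bit counters starting
 * at dword 3.
 */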
646
647 static bool
648 inc_n_oa_users(struct brw_context *brw)
649 {
650 if (brw->perfquery.n_oa_users == 0 &&
651 drmIoctl(brw->perfquery.oa_stream_fd,
652 I915_PERF_IOCTL_ENABLE, 0) < 0)
653 {
654 return false;
655 }
656 ++brw->perfquery.n_oa_users;
657
658 return true;
659 }
660
661 static void
662 dec_n_oa_users(struct brw_context *brw)
663 {
664 /* Disabling the i915 perf stream will effectively disable the OA
665 * counters. Note it's important to be sure there are no outstanding
666 * MI_RPC commands at this point since they could stall the CS
667 * indefinitely once OACONTROL is disabled.
668 */
669 --brw->perfquery.n_oa_users;
670 if (brw->perfquery.n_oa_users == 0 &&
671 drmIoctl(brw->perfquery.oa_stream_fd, I915_PERF_IOCTL_DISABLE, 0) < 0)
672 {
673 DBG("WARNING: Error disabling i915 perf stream: %m\n");
674 }
675 }
676
677 /* In general, if we see anything spurious while accumulating results
678 * we don't try to continue accumulating the current query hoping for
679 * the best; we scrap anything outstanding and then hope for the
680 * best with new queries.
681 */
682 static void
683 discard_all_queries(struct brw_context *brw)
684 {
685 while (brw->perfquery.unaccumulated_elements) {
686 struct brw_perf_query_object *obj = brw->perfquery.unaccumulated[0];
687
688 obj->oa.results_accumulated = true;
689 drop_from_unaccumulated_query_list(brw, brw->perfquery.unaccumulated[0]);
690
691 dec_n_oa_users(brw);
692 }
693 }
694
695 enum OaReadStatus {
696 OA_READ_STATUS_ERROR,
697 OA_READ_STATUS_UNFINISHED,
698 OA_READ_STATUS_FINISHED,
699 };
700
701 static enum OaReadStatus
702 read_oa_samples_until(struct brw_context *brw,
703 uint32_t start_timestamp,
704 uint32_t end_timestamp)
705 {
706 struct exec_node *tail_node =
707 exec_list_get_tail(&brw->perfquery.sample_buffers);
708 struct brw_oa_sample_buf *tail_buf =
709 exec_node_data(struct brw_oa_sample_buf, tail_node, link);
710 uint32_t last_timestamp = tail_buf->last_timestamp;
711
712 while (1) {
713 struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
714 uint32_t offset;
715 int len;
716
717 while ((len = read(brw->perfquery.oa_stream_fd, buf->buf,
718 sizeof(buf->buf))) < 0 && errno == EINTR)
719 ;
720
721 if (len <= 0) {
722 exec_list_push_tail(&brw->perfquery.free_sample_buffers, &buf->link);
723
724 if (len < 0) {
725 if (errno == EAGAIN)
726 return ((last_timestamp - start_timestamp) >=
727 (end_timestamp - start_timestamp)) ?
728 OA_READ_STATUS_FINISHED :
729 OA_READ_STATUS_UNFINISHED;
730 else {
731 DBG("Error reading i915 perf samples: %m\n");
732 }
733 } else
734 DBG("Spurious EOF reading i915 perf samples\n");
735
736 return OA_READ_STATUS_ERROR;
737 }
738
739 buf->len = len;
740 exec_list_push_tail(&brw->perfquery.sample_buffers, &buf->link);
741
742 /* Go through the reports and update the last timestamp. */
743 offset = 0;
744 while (offset < buf->len) {
745 const struct drm_i915_perf_record_header *header =
746 (const struct drm_i915_perf_record_header *) &buf->buf[offset];
747 uint32_t *report = (uint32_t *) (header + 1);
748
749 if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
750 last_timestamp = report[1];
751
752 offset += header->size;
753 }
754
755 buf->last_timestamp = last_timestamp;
756 }
757
758 unreachable("not reached");
759 return OA_READ_STATUS_ERROR;
760 }
761
762 /**
763 * Try to read all the reports until either the delimiting timestamp
764 * or an error arises.
765 */
766 static bool
767 read_oa_samples_for_query(struct brw_context *brw,
768 struct brw_perf_query_object *obj)
769 {
770 uint32_t *start;
771 uint32_t *last;
772 uint32_t *end;
773
774 /* We need the MI_REPORT_PERF_COUNT to land before we can start
775 * accumulating. */
776 assert(!brw_batch_references(&brw->batch, obj->oa.bo) &&
777 !brw_bo_busy(obj->oa.bo));
778
779 /* Map the BO once here and let accumulate_oa_reports() unmap
780 * it. */
781 if (obj->oa.map == NULL)
782 obj->oa.map = brw_bo_map(brw, obj->oa.bo, MAP_READ);
783
784 start = last = obj->oa.map;
785 end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
786
787 if (start[0] != obj->oa.begin_report_id) {
788 DBG("Spurious start report id=%"PRIu32"\n", start[0]);
789 return true;
790 }
791 if (end[0] != (obj->oa.begin_report_id + 1)) {
792 DBG("Spurious end report id=%"PRIu32"\n", end[0]);
793 return true;
794 }
795
796 /* Read the reports until the end timestamp. */
797 switch (read_oa_samples_until(brw, start[1], end[1])) {
798 case OA_READ_STATUS_ERROR:
799 /* Fallthrough and let accumulate_oa_reports() deal with the
800 * error. */
801 case OA_READ_STATUS_FINISHED:
802 return true;
803 case OA_READ_STATUS_UNFINISHED:
804 return false;
805 }
806
807 unreachable("invalid read status");
808 return false;
809 }
810
811 /**
812 * Accumulate raw OA counter values based on deltas between pairs of
813 * OA reports.
814 *
815 * Accumulation starts from the first report captured via
816 * MI_REPORT_PERF_COUNT (MI_RPC) by brw_begin_perf_query() until the
817 * last MI_RPC report requested by brw_end_perf_query(). Between these
818 * two reports there may also be some number of periodically sampled OA
819 * reports collected via the i915 perf interface - depending on the
820 * duration of the query.
821 *
822 * These periodic snapshots help to ensure we handle counter overflow
823 * correctly by being frequent enough to ensure we don't miss multiple
824 * overflows of a counter between snapshots. For Gen8+ the i915 perf
825 * snapshots provide the extra context-switch reports that let us
826 * subtract out the progress of counters associated with other
827 * contexts running on the system.
828 */
829 static void
830 accumulate_oa_reports(struct brw_context *brw,
831 struct brw_perf_query_object *obj)
832 {
833 const struct gen_device_info *devinfo = &brw->screen->devinfo;
834 struct gl_perf_query_object *o = &obj->base;
835 uint32_t *start;
836 uint32_t *last;
837 uint32_t *end;
838 struct exec_node *first_samples_node;
839 bool in_ctx = true;
840 uint32_t ctx_id;
841 int out_duration = 0;
842
843 assert(o->Ready);
844 assert(obj->oa.map != NULL);
845
846 start = last = obj->oa.map;
847 end = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
848
849 if (start[0] != obj->oa.begin_report_id) {
850 DBG("Spurious start report id=%"PRIu32"\n", start[0]);
851 goto error;
852 }
853 if (end[0] != (obj->oa.begin_report_id + 1)) {
854 DBG("Spurious end report id=%"PRIu32"\n", end[0]);
855 goto error;
856 }
857
858 ctx_id = start[2];
859
860 /* See if we have any periodic reports to accumulate too... */
861
862 /* N.B. The oa.samples_head was set when the query began and
863 * pointed to the tail of the brw->perfquery.sample_buffers list at
864 * the time the query started. Since the buffer existed before the
865 * first MI_REPORT_PERF_COUNT command was emitted we therefore know
866 * that no data in this particular node's buffer can possibly be
867 * associated with the query - so skip ahead one...
868 */
869 first_samples_node = obj->oa.samples_head->next;
870
871 foreach_list_typed_from(struct brw_oa_sample_buf, buf, link,
872 &brw->perfquery.sample_buffers,
873 first_samples_node)
874 {
875 int offset = 0;
876
877 while (offset < buf->len) {
878 const struct drm_i915_perf_record_header *header =
879 (const struct drm_i915_perf_record_header *)(buf->buf + offset);
880
881 assert(header->size != 0);
882 assert(header->size <= buf->len);
883
884 offset += header->size;
885
886 switch (header->type) {
887 case DRM_I915_PERF_RECORD_SAMPLE: {
888 uint32_t *report = (uint32_t *)(header + 1);
889 bool add = true;
890
891 /* Ignore reports that come before the start marker.
892 * (Note: takes care to allow overflow of 32bit timestamps)
893 */
894 if (timebase_scale(brw, report[1] - start[1]) > 5000000000)
895 continue;
896
897 /* Ignore reports that come after the end marker.
898 * (Note: takes care to allow overflow of 32bit timestamps)
899 */
900 if (timebase_scale(brw, report[1] - end[1]) <= 5000000000)
901 goto end;
902
903 /* For Gen8+ since the counters continue while other
904 * contexts are running we need to discount any unrelated
905 * deltas. The hardware automatically generates a report
906 * on context switch which gives us a new reference point
907 * to continue adding deltas from.
908 *
909 * For Haswell we can rely on the HW to stop the progress
910 * of OA counters while any other context is active.
911 */
912 if (devinfo->gen >= 8) {
913 if (in_ctx && report[2] != ctx_id) {
914 DBG("i915 perf: Switch AWAY (observed by ID change)\n");
915 in_ctx = false;
916 out_duration = 0;
917 } else if (in_ctx == false && report[2] == ctx_id) {
918 DBG("i915 perf: Switch TO\n");
919 in_ctx = true;
920
921 /* From experimentation in IGT, we found that the OA unit
922 * might label some report as "idle" (using an invalid
923 * context ID), right after a report for a given context.
924 * Deltas generated by those reports actually belong to the
925 * previous context, even though they're not labelled as
926 * such.
927 *
928 * We didn't *really* Switch AWAY in the case that we e.g.
929 * saw a single periodic report while idle...
930 */
931 if (out_duration >= 1)
932 add = false;
933 } else if (in_ctx) {
934 assert(report[2] == ctx_id);
935 DBG("i915 perf: Continuation IN\n");
936 } else {
937 assert(report[2] != ctx_id);
938 DBG("i915 perf: Continuation OUT\n");
939 add = false;
940 out_duration++;
941 }
942 }
943
944 if (add)
945 add_deltas(brw, obj, last, report);
946
947 last = report;
948
949 break;
950 }
951
952 case DRM_I915_PERF_RECORD_OA_BUFFER_LOST:
953 DBG("i915 perf: OA error: all reports lost\n");
954 goto error;
955 case DRM_I915_PERF_RECORD_OA_REPORT_LOST:
956 DBG("i915 perf: OA report lost\n");
957 break;
958 }
959 }
960 }
961
962 end:
963
964 add_deltas(brw, obj, last, end);
965
966 DBG("Marking %d accumulated - results gathered\n", o->Id);
967
968 brw_bo_unmap(obj->oa.bo);
969 obj->oa.map = NULL;
970 obj->oa.results_accumulated = true;
971 drop_from_unaccumulated_query_list(brw, obj);
972 dec_n_oa_users(brw);
973
974 return;
975
976 error:
977
978 brw_bo_unmap(obj->oa.bo);
979 obj->oa.map = NULL;
980 discard_all_queries(brw);
981 }
982
983 /******************************************************************************/
984
985 static bool
986 open_i915_perf_oa_stream(struct brw_context *brw,
987 int metrics_set_id,
988 int report_format,
989 int period_exponent,
990 int drm_fd,
991 uint32_t ctx_id)
992 {
993 uint64_t properties[] = {
994 /* Single context sampling */
995 DRM_I915_PERF_PROP_CTX_HANDLE, ctx_id,
996
997 /* Include OA reports in samples */
998 DRM_I915_PERF_PROP_SAMPLE_OA, true,
999
1000 /* OA unit configuration */
1001 DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
1002 DRM_I915_PERF_PROP_OA_FORMAT, report_format,
1003 DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
1004 };
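/* The properties array above is a flat list of (key, value) pairs, which
 * is why num_properties below is ARRAY_SIZE(properties) / 2.
 */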
1005 struct drm_i915_perf_open_param param = {
1006 .flags = I915_PERF_FLAG_FD_CLOEXEC |
1007 I915_PERF_FLAG_FD_NONBLOCK |
1008 I915_PERF_FLAG_DISABLED,
1009 .num_properties = ARRAY_SIZE(properties) / 2,
1010 .properties_ptr = (uintptr_t) properties,
1011 };
1012 int fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
1013 if (fd == -1) {
1014 DBG("Error opening i915 perf OA stream: %m\n");
1015 return false;
1016 }
1017
1018 brw->perfquery.oa_stream_fd = fd;
1019
1020 brw->perfquery.current_oa_metrics_set_id = metrics_set_id;
1021 brw->perfquery.current_oa_format = report_format;
1022
1023 return true;
1024 }
1025
1026 static void
1027 close_perf(struct brw_context *brw)
1028 {
1029 if (brw->perfquery.oa_stream_fd != -1) {
1030 close(brw->perfquery.oa_stream_fd);
1031 brw->perfquery.oa_stream_fd = -1;
1032 }
1033 }
1034
1035 /**
1036 * Driver hook for glBeginPerfQueryINTEL().
1037 */
1038 static bool
1039 brw_begin_perf_query(struct gl_context *ctx,
1040 struct gl_perf_query_object *o)
1041 {
1042 struct brw_context *brw = brw_context(ctx);
1043 struct brw_perf_query_object *obj = brw_perf_query(o);
1044 const struct brw_perf_query_info *query = obj->query;
1045
1046 /* We can assume the frontend hides mistaken attempts to Begin a
1047 * query object multiple times before its End. Similarly if an
1048 * application reuses a query object before results have arrived
1049 * the frontend will wait for prior results so we don't need
1050 * to support abandoning in-flight results.
1051 */
1052 assert(!o->Active);
1053 assert(!o->Used || o->Ready); /* no in-flight query to worry about */
1054
1055 DBG("Begin(%d)\n", o->Id);
1056
1057 /* XXX: We have to consider that the command parser unit that parses batch
1058 * buffer commands and is used to capture begin/end counter snapshots isn't
1059 * implicitly synchronized with what's currently running across other GPU
1060 * units (such as the EUs running shaders) that the performance counters are
1061 * associated with.
1062 *
1063 * The intention of performance queries is to measure the work associated
1064 * with commands between the begin/end delimiters and so for that to be the
1065 * case we need to explicitly synchronize the parsing of commands to capture
1066 * Begin/End counter snapshots with what's running across other parts of the
1067 * GPU.
1068 *
1069 * When the command parser reaches a Begin marker it effectively needs to
1070 * drain everything currently running on the GPU until the hardware is idle
1071 * before capturing the first snapshot of counters - otherwise the results
1072 * would also be measuring the effects of earlier commands.
1073 *
1074 * When the command parser reaches an End marker it needs to stall until
1075 * everything currently running on the GPU has finished before capturing the
1076 * end snapshot - otherwise the results won't be a complete representation
1077 * of the work.
1078 *
1079 * Theoretically there could be opportunities to minimize how much of the
1080 * GPU pipeline is drained, or that we stall for, when we know what specific
1081 * units the performance counters being queried relate to but we don't
1082 * currently attempt to be clever here.
1083 *
1084 * Note: with our current simple approach here then for back-to-back queries
1085 * we will redundantly emit duplicate commands to synchronize the command
1086 * streamer with the rest of the GPU pipeline, but we assume that in HW the
1087 * second synchronization is effectively a NOOP.
1088 *
1089 * N.B. The final results are based on deltas of counters between (inside)
1090 * Begin/End markers so even though the total wall clock time of the
1091 * workload is stretched by larger pipeline bubbles the bubbles themselves
1092 * are generally invisible to the query results. Whether that's a good or a
1093 * bad thing depends on the use case. For a lower real-time impact while
1094 * capturing metrics then periodic sampling may be a better choice than
1095 * INTEL_performance_query.
1096 *
1097 *
1098 * This is our Begin synchronization point to drain current work on the
1099 * GPU before we capture our first counter snapshot...
1100 */
1101 brw_emit_mi_flush(brw);
1102
1103 switch (query->kind) {
1104 case OA_COUNTERS:
1105
1106 /* Opening an i915 perf stream implies exclusive access to the OA unit
1107 * which will generate counter reports for a specific counter set with a
1108 * specific layout/format so we can't begin any OA based queries that
1109 * require a different counter set or format unless we get an opportunity
1110 * to close the stream and open a new one...
1111 */
1112 if (brw->perfquery.oa_stream_fd != -1 &&
1113 brw->perfquery.current_oa_metrics_set_id !=
1114 query->oa_metrics_set_id) {
1115
1116 if (brw->perfquery.n_oa_users != 0)
1117 return false;
1118 else
1119 close_perf(brw);
1120 }
1121
1122 /* If the OA counters aren't already on, enable them. */
1123 if (brw->perfquery.oa_stream_fd == -1) {
1124 __DRIscreen *screen = brw->screen->driScrnPriv;
1125 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1126
1127 /* The period_exponent gives a sampling period as follows:
1128 * sample_period = timestamp_period * 2^(period_exponent + 1)
1129 *
1130 * The timestamp increments every 80ns (HSW), ~52ns (GEN9LP) or
1131 * ~83ns (GEN8/9).
1132 *
1133 * The counter overflow period is derived from the EuActive counter
1134 * which reads a counter that increments by the number of clock
1135 * cycles multiplied by the number of EUs. It can be calculated as:
1136 *
1137 * 2^(number of bits in A counter) / (n_eus * max_gen_freq * 2)
1138 *
1139 * (E.g. 40 EUs @ 1GHz = ~53ms)
1140 *
1141 * We select a sampling period lower than that overflow period to
1142 * ensure we cannot see more than 1 counter overflow, otherwise we
1143 * could lose information.
1144 */
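/* Worked example of the above, assuming 40 EUs at the 1GHz reference
 * frequency: a 32bit A counter overflows after 2^32 / (40 * 2) =
 * ~53.7 million ns (the ~53ms quoted above), while a 40bit counter
 * (Gen8+) stretches that to 2^40 / (40 * 2) = ~13.7 billion ns (~13.7s).
 */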
1145
1146 int a_counter_in_bits = 32;
1147 if (devinfo->gen >= 8)
1148 a_counter_in_bits = 40;
1149
1150 uint64_t overflow_period = pow(2, a_counter_in_bits) /
1151 (brw->perfquery.sys_vars.n_eus *
1152 /* drop 1GHz freq to have units in nanoseconds */
1153 2);
1154
1155 DBG("A counter overflow period: %"PRIu64"ns, %"PRIu64"ms (n_eus=%"PRIu64")\n",
1156 overflow_period, overflow_period / 1000000ul, brw->perfquery.sys_vars.n_eus);
1157
1158 int period_exponent = 0;
1159 uint64_t prev_sample_period, next_sample_period;
1160 for (int e = 0; e < 30; e++) {
1161 prev_sample_period = 1000000000ull * pow(2, e + 1) / devinfo->timestamp_frequency;
1162 next_sample_period = 1000000000ull * pow(2, e + 2) / devinfo->timestamp_frequency;
1163
1164 /* Take the previous sampling period, which is lower than the
1165 * overflow period.
1166 */
1167 if (prev_sample_period < overflow_period &&
1168 next_sample_period > overflow_period)
1169 period_exponent = e + 1;
1170 }
1171
1172 if (period_exponent == 0) {
1173 DBG("WARNING: enable to find a sampling exponent\n");
1174 return false;
1175 }
1176
1177 DBG("OA sampling exponent: %i ~= %"PRIu64"ms\n", period_exponent,
1178 prev_sample_period / 1000000ul);
1179
1180 if (!open_i915_perf_oa_stream(brw,
1181 query->oa_metrics_set_id,
1182 query->oa_format,
1183 period_exponent,
1184 screen->fd, /* drm fd */
1185 brw->hw_ctx))
1186 return false;
1187 } else {
1188 assert(brw->perfquery.current_oa_metrics_set_id ==
1189 query->oa_metrics_set_id &&
1190 brw->perfquery.current_oa_format ==
1191 query->oa_format);
1192 }
1193
1194 if (!inc_n_oa_users(brw)) {
1195 DBG("WARNING: Error enabling i915 perf stream: %m\n");
1196 return false;
1197 }
1198
1199 if (obj->oa.bo) {
1200 brw_bo_unreference(obj->oa.bo);
1201 obj->oa.bo = NULL;
1202 }
1203
1204 obj->oa.bo =
1205 brw_bo_alloc(brw->bufmgr, "perf. query OA MI_RPC bo",
1206 MI_RPC_BO_SIZE, 64);
1207 #ifdef DEBUG
1208 /* Pre-filling the BO helps debug whether writes landed. */
1209 void *map = brw_bo_map(brw, obj->oa.bo, MAP_WRITE);
1210 memset(map, 0x80, MI_RPC_BO_SIZE);
1211 brw_bo_unmap(obj->oa.bo);
1212 #endif
1213
1214 obj->oa.begin_report_id = brw->perfquery.next_query_start_report_id;
1215 brw->perfquery.next_query_start_report_id += 2;
1216
1217 /* We flush the batchbuffer here to minimize the chances that MI_RPC
1218 * delimiting commands end up in different batchbuffers. If that's the
1219 * case, the measurement will include the time it takes for the kernel
1220 * scheduler to load a new request into the hardware. This is manifested in
1221 * tools like frameretrace by spikes in the "GPU Core Clocks" counter.
1222 */
1223 intel_batchbuffer_flush(brw);
1224
1225 /* Take a starting OA counter snapshot. */
1226 brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
1227 obj->oa.begin_report_id);
1228 ++brw->perfquery.n_active_oa_queries;
1229
1230 /* No already-buffered samples can possibly be associated with this query
1231 * so create a marker within the list of sample buffers enabling us to
1232 * easily ignore earlier samples when processing this query after
1233 * completion.
1234 */
1235 assert(!exec_list_is_empty(&brw->perfquery.sample_buffers));
1236 obj->oa.samples_head = exec_list_get_tail(&brw->perfquery.sample_buffers);
1237
1238 struct brw_oa_sample_buf *buf =
1239 exec_node_data(struct brw_oa_sample_buf, obj->oa.samples_head, link);
1240
1241 /* This reference will ensure that future/following sample
1242 * buffers (that may relate to this query) can't be freed until
1243 * this drops to zero.
1244 */
1245 buf->refcount++;
1246
1247 memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator));
1248 obj->oa.results_accumulated = false;
1249
1250 add_to_unaccumulated_query_list(brw, obj);
1251 break;
1252
1253 case PIPELINE_STATS:
1254 if (obj->pipeline_stats.bo) {
1255 brw_bo_unreference(obj->pipeline_stats.bo);
1256 obj->pipeline_stats.bo = NULL;
1257 }
1258
1259 obj->pipeline_stats.bo =
1260 brw_bo_alloc(brw->bufmgr, "perf. query pipeline stats bo",
1261 STATS_BO_SIZE, 64);
1262
1263 /* Take starting snapshots. */
1264 snapshot_statistics_registers(brw, obj, 0);
1265
1266 ++brw->perfquery.n_active_pipeline_stats_queries;
1267 break;
1268 }
1269
1270 if (INTEL_DEBUG & DEBUG_PERFMON)
1271 dump_perf_queries(brw);
1272
1273 return true;
1274 }
1275
1276 /**
1277 * Driver hook for glEndPerfQueryINTEL().
1278 */
1279 static void
1280 brw_end_perf_query(struct gl_context *ctx,
1281 struct gl_perf_query_object *o)
1282 {
1283 struct brw_context *brw = brw_context(ctx);
1284 struct brw_perf_query_object *obj = brw_perf_query(o);
1285
1286 DBG("End(%d)\n", o->Id);
1287
1288 /* Ensure that the work associated with the queried commands will have
1289 * finished before taking our query end counter readings.
1290 *
1291 * For more details see comment in brw_begin_perf_query for
1292 * corresponding flush.
1293 */
1294 brw_emit_mi_flush(brw);
1295
1296 switch (obj->query->kind) {
1297 case OA_COUNTERS:
1298
1299 /* NB: It's possible that the query will have already been marked
1300 * as 'accumulated' if an error was seen while reading samples
1301 * from perf. In this case we mustn't try to emit a closing
1302 * MI_RPC command in case the OA unit has already been disabled
1303 */
1304 if (!obj->oa.results_accumulated) {
1305 /* Take an ending OA counter snapshot. */
1306 brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
1307 MI_RPC_BO_END_OFFSET_BYTES,
1308 obj->oa.begin_report_id + 1);
1309 }
1310
1311 --brw->perfquery.n_active_oa_queries;
1312
1313 /* NB: even though the query has now ended, it can't be accumulated
1314 * until the end MI_REPORT_PERF_COUNT snapshot has been written
1315 * to query->oa.bo
1316 */
1317 break;
1318
1319 case PIPELINE_STATS:
1320 snapshot_statistics_registers(brw, obj,
1321 STATS_BO_END_OFFSET_BYTES);
1322 --brw->perfquery.n_active_pipeline_stats_queries;
1323 break;
1324 }
1325 }
1326
1327 static void
1328 brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
1329 {
1330 struct brw_context *brw = brw_context(ctx);
1331 struct brw_perf_query_object *obj = brw_perf_query(o);
1332 struct brw_bo *bo = NULL;
1333
1334 assert(!o->Ready);
1335
1336 switch (obj->query->kind) {
1337 case OA_COUNTERS:
1338 bo = obj->oa.bo;
1339 break;
1340
1341 case PIPELINE_STATS:
1342 bo = obj->pipeline_stats.bo;
1343 break;
1344 }
1345
1346 if (bo == NULL)
1347 return;
1348
1349 /* If the current batch references our results bo then we need to
1350 * flush first...
1351 */
1352 if (brw_batch_references(&brw->batch, bo))
1353 intel_batchbuffer_flush(brw);
1354
1355 brw_bo_wait_rendering(bo);
1356
1357 /* Due to a race condition between the OA unit signaling report
1358 * availability and the report actually being written into memory,
1359 * we need to wait for all the reports to come in before we can
1360 * read them.
1361 */
1362 if (obj->query->kind == OA_COUNTERS) {
1363 while (!read_oa_samples_for_query(brw, obj))
1364 ;
1365 }
1366 }
1367
1368 static bool
1369 brw_is_perf_query_ready(struct gl_context *ctx,
1370 struct gl_perf_query_object *o)
1371 {
1372 struct brw_context *brw = brw_context(ctx);
1373 struct brw_perf_query_object *obj = brw_perf_query(o);
1374
1375 if (o->Ready)
1376 return true;
1377
1378 switch (obj->query->kind) {
1379 case OA_COUNTERS:
1380 return (obj->oa.results_accumulated ||
1381 (obj->oa.bo &&
1382 !brw_batch_references(&brw->batch, obj->oa.bo) &&
1383 !brw_bo_busy(obj->oa.bo) &&
1384 read_oa_samples_for_query(brw, obj)));
1385 case PIPELINE_STATS:
1386 return (obj->pipeline_stats.bo &&
1387 !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) &&
1388 !brw_bo_busy(obj->pipeline_stats.bo));
1389 }
1390
1391 unreachable("missing ready check for unknown query kind");
1392 return false;
1393 }
1394
1395 static int
1396 get_oa_counter_data(struct brw_context *brw,
1397 struct brw_perf_query_object *obj,
1398 size_t data_size,
1399 uint8_t *data)
1400 {
1401 const struct brw_perf_query_info *query = obj->query;
1402 int n_counters = query->n_counters;
1403 int written = 0;
1404
1405 if (!obj->oa.results_accumulated) {
1406 accumulate_oa_reports(brw, obj);
1407 assert(obj->oa.results_accumulated);
1408 }
1409
1410 for (int i = 0; i < n_counters; i++) {
1411 const struct brw_perf_query_counter *counter = &query->counters[i];
1412 uint64_t *out_uint64;
1413 float *out_float;
1414
1415 if (counter->size) {
1416 switch (counter->data_type) {
1417 case GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL:
1418 out_uint64 = (uint64_t *)(data + counter->offset);
1419 *out_uint64 = counter->oa_counter_read_uint64(brw, query,
1420 obj->oa.accumulator);
1421 break;
1422 case GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL:
1423 out_float = (float *)(data + counter->offset);
1424 *out_float = counter->oa_counter_read_float(brw, query,
1425 obj->oa.accumulator);
1426 break;
1427 default:
1428 /* So far we aren't using uint32, double or bool32... */
1429 unreachable("unexpected counter data type");
1430 }
1431 written = counter->offset + counter->size;
1432 }
1433 }
1434
1435 return written;
1436 }
1437
1438 static int
1439 get_pipeline_stats_data(struct brw_context *brw,
1440 struct brw_perf_query_object *obj,
1441 size_t data_size,
1442 uint8_t *data)
1443
1444 {
1445 const struct brw_perf_query_info *query = obj->query;
1446 int n_counters = obj->query->n_counters;
1447 uint8_t *p = data;
1448
1449 uint64_t *start = brw_bo_map(brw, obj->pipeline_stats.bo, MAP_READ);
1450 uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));
1451
1452 for (int i = 0; i < n_counters; i++) {
1453 const struct brw_perf_query_counter *counter = &query->counters[i];
1454 uint64_t value = end[i] - start[i];
1455
1456 if (counter->pipeline_stat.numerator !=
1457 counter->pipeline_stat.denominator) {
1458 value *= counter->pipeline_stat.numerator;
1459 value /= counter->pipeline_stat.denominator;
1460 }
1461
1462 *((uint64_t *)p) = value;
1463 p += 8;
1464 }
1465
1466 brw_bo_unmap(obj->pipeline_stats.bo);
1467
1468 return p - data;
1469 }
1470
1471 /**
1472 * Driver hook for glGetPerfQueryDataINTEL().
1473 */
1474 static void
1475 brw_get_perf_query_data(struct gl_context *ctx,
1476 struct gl_perf_query_object *o,
1477 GLsizei data_size,
1478 GLuint *data,
1479 GLuint *bytes_written)
1480 {
1481 struct brw_context *brw = brw_context(ctx);
1482 struct brw_perf_query_object *obj = brw_perf_query(o);
1483 int written = 0;
1484
1485 assert(brw_is_perf_query_ready(ctx, o));
1486
1487 DBG("GetData(%d)\n", o->Id);
1488
1489 if (INTEL_DEBUG & DEBUG_PERFMON)
1490 dump_perf_queries(brw);
1491
1492 /* We expect that the frontend only calls this hook when it knows
1493 * that results are available.
1494 */
1495 assert(o->Ready);
1496
1497 switch (obj->query->kind) {
1498 case OA_COUNTERS:
1499 written = get_oa_counter_data(brw, obj, data_size, (uint8_t *)data);
1500 break;
1501
1502 case PIPELINE_STATS:
1503 written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data);
1504 break;
1505 }
1506
1507 if (bytes_written)
1508 *bytes_written = written;
1509 }
1510
1511 static struct gl_perf_query_object *
1512 brw_new_perf_query_object(struct gl_context *ctx, unsigned query_index)
1513 {
1514 struct brw_context *brw = brw_context(ctx);
1515 const struct brw_perf_query_info *query =
1516 &brw->perfquery.queries[query_index];
1517 struct brw_perf_query_object *obj =
1518 calloc(1, sizeof(struct brw_perf_query_object));
1519
1520 if (!obj)
1521 return NULL;
1522
1523 obj->query = query;
1524
1525 brw->perfquery.n_query_instances++;
1526
1527 return &obj->base;
1528 }
1529
1530 /**
1531 * Driver hook for glDeletePerfQueryINTEL().
1532 */
1533 static void
1534 brw_delete_perf_query(struct gl_context *ctx,
1535 struct gl_perf_query_object *o)
1536 {
1537 struct brw_context *brw = brw_context(ctx);
1538 struct brw_perf_query_object *obj = brw_perf_query(o);
1539
1540 /* We can assume that the frontend waits for a query to complete
1541 * before ever calling into here, so we don't have to worry about
1542 * deleting an in-flight query object.
1543 */
1544 assert(!o->Active);
1545 assert(!o->Used || o->Ready);
1546
1547 DBG("Delete(%d)\n", o->Id);
1548
1549 switch (obj->query->kind) {
1550 case OA_COUNTERS:
1551 if (obj->oa.bo) {
1552 if (!obj->oa.results_accumulated) {
1553 drop_from_unaccumulated_query_list(brw, obj);
1554 dec_n_oa_users(brw);
1555 }
1556
1557 brw_bo_unreference(obj->oa.bo);
1558 obj->oa.bo = NULL;
1559 }
1560
1561 obj->oa.results_accumulated = false;
1562 break;
1563
1564 case PIPELINE_STATS:
1565 if (obj->pipeline_stats.bo) {
1566 brw_bo_unreference(obj->pipeline_stats.bo);
1567 obj->pipeline_stats.bo = NULL;
1568 }
1569 break;
1570 }
1571
1572 free(obj);
1573
1574 /* As an indication that the INTEL_performance_query extension is no
1575 * longer in use, it's a good time to free our cache of sample
1576 * buffers and close any current i915-perf stream.
1577 */
1578 if (--brw->perfquery.n_query_instances == 0) {
1579 free_sample_bufs(brw);
1580 close_perf(brw);
1581 }
1582 }
1583
1584 /******************************************************************************/
1585
1586 static struct brw_perf_query_info *
1587 append_query_info(struct brw_context *brw)
1588 {
1589 brw->perfquery.queries =
1590 reralloc(brw, brw->perfquery.queries,
1591 struct brw_perf_query_info, ++brw->perfquery.n_queries);
1592
1593 return &brw->perfquery.queries[brw->perfquery.n_queries - 1];
1594 }
1595
1596 static void
1597 add_stat_reg(struct brw_perf_query_info *query,
1598 uint32_t reg,
1599 uint32_t numerator,
1600 uint32_t denominator,
1601 const char *name,
1602 const char *description)
1603 {
1604 struct brw_perf_query_counter *counter;
1605
1606 assert(query->n_counters < MAX_STAT_COUNTERS);
1607
1608 counter = &query->counters[query->n_counters];
1609 counter->name = name;
1610 counter->desc = description;
1611 counter->type = GL_PERFQUERY_COUNTER_RAW_INTEL;
1612 counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
1613 counter->size = sizeof(uint64_t);
1614 counter->offset = sizeof(uint64_t) * query->n_counters;
1615 counter->pipeline_stat.reg = reg;
1616 counter->pipeline_stat.numerator = numerator;
1617 counter->pipeline_stat.denominator = denominator;
1618
1619 query->n_counters++;
1620 }
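/*
 * Example usage, taken from init_pipeline_statistic_query_registers()
 * below: PS_INVOCATION_COUNT is registered with a numerator/denominator
 * ratio of 1/4 on Haswell and Gen8, so get_pipeline_stats_data() scales
 * the raw register delta down by 4 before returning it to the
 * application, while add_basic_stat_reg() covers the common 1/1 case.
 */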
1621
1622 static void
1623 add_basic_stat_reg(struct brw_perf_query_info *query,
1624 uint32_t reg, const char *name)
1625 {
1626 add_stat_reg(query, reg, 1, 1, name, name);
1627 }
1628
1629 static void
1630 init_pipeline_statistic_query_registers(struct brw_context *brw)
1631 {
1632 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1633 struct brw_perf_query_info *query = append_query_info(brw);
1634
1635 query->kind = PIPELINE_STATS;
1636 query->name = "Pipeline Statistics Registers";
1637 query->n_counters = 0;
1638 query->counters =
1639 rzalloc_array(brw, struct brw_perf_query_counter, MAX_STAT_COUNTERS);
1640
1641 add_basic_stat_reg(query, IA_VERTICES_COUNT,
1642 "N vertices submitted");
1643 add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
1644 "N primitives submitted");
1645 add_basic_stat_reg(query, VS_INVOCATION_COUNT,
1646 "N vertex shader invocations");
1647
1648 if (devinfo->gen == 6) {
1649 add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
1650 "SO_PRIM_STORAGE_NEEDED",
1651 "N geometry shader stream-out primitives (total)");
1652 add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
1653 "SO_NUM_PRIMS_WRITTEN",
1654 "N geometry shader stream-out primitives (written)");
1655 } else {
1656 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
1657 "SO_PRIM_STORAGE_NEEDED (Stream 0)",
1658 "N stream-out (stream 0) primitives (total)");
1659 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
1660 "SO_PRIM_STORAGE_NEEDED (Stream 1)",
1661 "N stream-out (stream 1) primitives (total)");
1662 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
1663 "SO_PRIM_STORAGE_NEEDED (Stream 2)",
1664 "N stream-out (stream 2) primitives (total)");
1665 add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
1666 "SO_PRIM_STORAGE_NEEDED (Stream 3)",
1667 "N stream-out (stream 3) primitives (total)");
1668 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
1669 "SO_NUM_PRIMS_WRITTEN (Stream 0)",
1670 "N stream-out (stream 0) primitives (written)");
1671 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
1672 "SO_NUM_PRIMS_WRITTEN (Stream 1)",
1673 "N stream-out (stream 1) primitives (written)");
1674 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
1675 "SO_NUM_PRIMS_WRITTEN (Stream 2)",
1676 "N stream-out (stream 2) primitives (written)");
1677 add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
1678 "SO_NUM_PRIMS_WRITTEN (Stream 3)",
1679 "N stream-out (stream 3) primitives (written)");
1680 }
1681
1682 add_basic_stat_reg(query, HS_INVOCATION_COUNT,
1683 "N TCS shader invocations");
1684 add_basic_stat_reg(query, DS_INVOCATION_COUNT,
1685 "N TES shader invocations");
1686
1687 add_basic_stat_reg(query, GS_INVOCATION_COUNT,
1688 "N geometry shader invocations");
1689 add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
1690 "N geometry shader primitives emitted");
1691
1692 add_basic_stat_reg(query, CL_INVOCATION_COUNT,
1693 "N primitives entering clipping");
1694 add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
1695 "N primitives leaving clipping");
1696
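   /* On Haswell and Gen8 the raw value is scaled by 1/4 below; this is
    * understood to correspond to the WaDividePSInvocationCountBy4
    * workaround, where the hardware reports four times the actual number
    * of fragment shader invocations.
    */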
1697 if (devinfo->is_haswell || devinfo->gen == 8)
1698 add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
1699 "N fragment shader invocations",
1700 "N fragment shader invocations");
1701 else
1702 add_basic_stat_reg(query, PS_INVOCATION_COUNT,
1703 "N fragment shader invocations");
1704
1705 add_basic_stat_reg(query, PS_DEPTH_COUNT, "N z-pass fragments");
1706
1707 if (devinfo->gen >= 7)
1708 add_basic_stat_reg(query, CS_INVOCATION_COUNT,
1709 "N compute shader invocations");
1710
1711 query->data_size = sizeof(uint64_t) * query->n_counters;
1712 }
1713
1714 static bool
1715 read_file_uint64(const char *file, uint64_t *val)
1716 {
1717 char buf[32];
1718 int fd, n;
1719
1720    fd = open(file, O_RDONLY);
1721 if (fd < 0)
1722 return false;
1723 while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
1724 errno == EINTR);
1725 close(fd);
1726 if (n < 0)
1727 return false;
1728
1729 buf[n] = '\0';
1730 *val = strtoull(buf, NULL, 0);
1731
1732 return true;
1733 }
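/* read_file_uint64() is used below for small sysfs/procfs files such as
 * the per-metric-set "id" files and /proc/sys/dev/i915/perf_stream_paranoid,
 * each of which holds a single integer; strtoull() with base 0 would also
 * accept a 0x-prefixed value, though these files are expected to contain
 * plain decimal.
 */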
1734
1735 static void
1736 register_oa_config(struct brw_context *brw,
1737 const struct brw_perf_query_info *query,
1738 uint64_t config_id)
1739 {
1740    struct brw_perf_query_info *registered_query = append_query_info(brw);
1741    *registered_query = *query;
1742    registered_query->oa_metrics_set_id = config_id;
1743    DBG("metric set registered: id = %" PRIu64", guid = %s\n",
1744        registered_query->oa_metrics_set_id, query->guid);
1745 }
1746
1747 static void
1748 enumerate_sysfs_metrics(struct brw_context *brw, const char *sysfs_dev_dir)
1749 {
1750 char buf[256];
1751 DIR *metricsdir = NULL;
1752 struct dirent *metric_entry;
1753 int len;
1754
1755 len = snprintf(buf, sizeof(buf), "%s/metrics", sysfs_dev_dir);
1756 if (len < 0 || len >= sizeof(buf)) {
1757 DBG("Failed to concatenate path to sysfs metrics/ directory\n");
1758 return;
1759 }
1760
1761 metricsdir = opendir(buf);
1762 if (!metricsdir) {
1763 DBG("Failed to open %s: %m\n", buf);
1764 return;
1765 }
1766
1767 while ((metric_entry = readdir(metricsdir))) {
1768 struct hash_entry *entry;
1769
1770 if ((metric_entry->d_type != DT_DIR &&
1771 metric_entry->d_type != DT_LNK) ||
1772 metric_entry->d_name[0] == '.')
1773 continue;
1774
1775 DBG("metric set: %s\n", metric_entry->d_name);
1776 entry = _mesa_hash_table_search(brw->perfquery.oa_metrics_table,
1777 metric_entry->d_name);
1778 if (entry) {
1779 uint64_t id;
1780
1781 len = snprintf(buf, sizeof(buf), "%s/metrics/%s/id",
1782 sysfs_dev_dir, metric_entry->d_name);
1783 if (len < 0 || len >= sizeof(buf)) {
1784 DBG("Failed to concatenate path to sysfs metric id file\n");
1785 continue;
1786 }
1787
1788 if (!read_file_uint64(buf, &id)) {
1789 DBG("Failed to read metric set id from %s: %m", buf);
1790 continue;
1791 }
1792
1793 register_oa_config(brw, (const struct brw_perf_query_info *)entry->data, id);
1794 } else
1795 DBG("metric set not known by mesa (skipping)\n");
1796 }
1797
1798 closedir(metricsdir);
1799 }
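/* For reference, the paths walked above look like (illustrative card index
 * and GUID, resolved via get_sysfs_dev_dir() below):
 *
 *    /sys/dev/char/<maj>:<min>/device/drm/card0/metrics/<guid>/id
 *
 * where each "id" file holds the kernel's metric set id for that
 * configuration.
 */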
1800
1801 static bool
1802 read_sysfs_drm_device_file_uint64(struct brw_context *brw,
1803 const char *sysfs_dev_dir,
1804 const char *file,
1805 uint64_t *value)
1806 {
1807 char buf[512];
1808 int len;
1809
1810 len = snprintf(buf, sizeof(buf), "%s/%s", sysfs_dev_dir, file);
1811 if (len < 0 || len >= sizeof(buf)) {
1812 DBG("Failed to concatenate sys filename to read u64 from\n");
1813 return false;
1814 }
1815
1816 return read_file_uint64(buf, value);
1817 }
1818
1819 static bool
1820 kernel_has_dynamic_config_support(struct brw_context *brw,
1821 const char *sysfs_dev_dir)
1822 {
1823 __DRIscreen *screen = brw->screen->driScrnPriv;
1824 struct hash_entry *entry;
1825
1826 hash_table_foreach(brw->perfquery.oa_metrics_table, entry) {
1827 struct brw_perf_query_info *query = entry->data;
1828 char config_path[256];
1829 uint64_t config_id;
1830
1831 snprintf(config_path, sizeof(config_path),
1832 "%s/metrics/%s/id", sysfs_dev_dir, query->guid);
1833
1834 /* Look for the test config, which we know we can't replace. */
1835 if (read_file_uint64(config_path, &config_id) && config_id == 1) {
1836 uint32_t mux_regs[] = { 0x9888 /* NOA_WRITE */, 0x0 };
1837 struct drm_i915_perf_oa_config config;
1838
1839 memset(&config, 0, sizeof(config));
1840
1841 memcpy(config.uuid, query->guid, sizeof(config.uuid));
1842
1843 config.n_mux_regs = 1;
1844 config.mux_regs_ptr = (uintptr_t) mux_regs;
1845
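         /* Presumably the test config (id 1) can never be removed, so a
          * kernel that implements DRM_IOCTL_I915_PERF_REMOVE_CONFIG is
          * expected to fail here with ENOENT, whereas a kernel without
          * dynamic config support rejects the unknown ioctl outright.
          */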
1846 if (drmIoctl(screen->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config_id) < 0 &&
1847 errno == ENOENT)
1848 return true;
1849
1850 break;
1851 }
1852 }
1853
1854 return false;
1855 }
1856
1857 static void
1858 init_oa_configs(struct brw_context *brw, const char *sysfs_dev_dir)
1859 {
1860 __DRIscreen *screen = brw->screen->driScrnPriv;
1861 struct hash_entry *entry;
1862
1863 hash_table_foreach(brw->perfquery.oa_metrics_table, entry) {
1864 const struct brw_perf_query_info *query = entry->data;
1865 struct drm_i915_perf_oa_config config;
1866 char config_path[256];
1867 uint64_t config_id;
1868 int ret;
1869
1870 snprintf(config_path, sizeof(config_path),
1871 "%s/metrics/%s/id", sysfs_dev_dir, query->guid);
1872
1873 /* Don't recreate already loaded configs. */
1874 if (read_file_uint64(config_path, &config_id)) {
1875 register_oa_config(brw, query, config_id);
1876 continue;
1877 }
1878
1879 memset(&config, 0, sizeof(config));
1880
1881 memcpy(config.uuid, query->guid, sizeof(config.uuid));
1882
1883 config.n_mux_regs = query->n_mux_regs;
1884 config.mux_regs_ptr = (uintptr_t) query->mux_regs;
1885
1886 config.n_boolean_regs = query->n_b_counter_regs;
1887 config.boolean_regs_ptr = (uintptr_t) query->b_counter_regs;
1888
1889 config.n_flex_regs = query->n_flex_regs;
1890 config.flex_regs_ptr = (uintptr_t) query->flex_regs;
1891
1892 ret = drmIoctl(screen->fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &config);
1893 if (ret < 0) {
1894 DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
1895 query->name, query->guid, strerror(errno));
1896 continue;
1897 }
1898
1899 register_oa_config(brw, query, ret);
1900 }
1901 }
1902
1903 static bool
1904 init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
1905 {
1906 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1907 uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
1908 __DRIscreen *screen = brw->screen->driScrnPriv;
1909
1910 if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
1911 "gt_min_freq_mhz",
1912 &min_freq_mhz))
1913 return false;
1914
1915 if (!read_sysfs_drm_device_file_uint64(brw, sysfs_dev_dir,
1916 "gt_max_freq_mhz",
1917 &max_freq_mhz))
1918 return false;
1919
1920 brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
1921 brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
1922 brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
1923
1924 brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
1925 brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
1926    /* Assuming a uniform distribution of subslices per slice. */
1927 brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
1928
1929 if (devinfo->is_haswell) {
1930 brw->perfquery.sys_vars.slice_mask = 0;
1931 brw->perfquery.sys_vars.subslice_mask = 0;
1932
1933 for (int s = 0; s < devinfo->num_slices; s++)
1934 brw->perfquery.sys_vars.slice_mask |= 1U << s;
1935 for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
1936 brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
1937
1938 if (devinfo->gt == 1) {
1939 brw->perfquery.sys_vars.n_eus = 10;
1940 } else if (devinfo->gt == 2) {
1941 brw->perfquery.sys_vars.n_eus = 20;
1942 } else if (devinfo->gt == 3) {
1943 brw->perfquery.sys_vars.n_eus = 40;
1944 } else
1945 unreachable("not reached");
1946 } else {
1947 drm_i915_getparam_t gp;
1948 int ret;
1949 int slice_mask = 0;
1950 int ss_mask = 0;
1951 /* maximum number of slices */
1952 int s_max = devinfo->num_slices;
1953       /* maximum number of subslices per slice (assuming a uniform number
1954        * of subslices per slice)
1955        */
1956 int ss_max = devinfo->num_subslices[0];
1957 uint64_t subslice_mask = 0;
1958 int s;
1959
1960 gp.param = I915_PARAM_SLICE_MASK;
1961 gp.value = &slice_mask;
1962 ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
1963 if (ret)
1964 return false;
1965
1966 gp.param = I915_PARAM_SUBSLICE_MASK;
1967 gp.value = &ss_mask;
1968 ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
1969 if (ret)
1970 return false;
1971
1972 brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
1973 brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
1974 brw->perfquery.sys_vars.slice_mask = slice_mask;
1975
1976 /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
1977 * which applies to all slices.
1978 *
1979 * Note: some of the metrics we have (as described in XML) are
1980 * conditional on a $SubsliceMask variable which is expected to also
1981 * reflect the slice mask by packing together subslice masks for each
1982       * slice into one value.
1983 */
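      /* For example (illustrative numbers): with slice_mask 0x3 and
       * ss_mask 0x7 on a part with 3 subslices per slice, the loop below
       * packs subslice_mask = 0x7 | (0x7 << 3) = 0x3f.
       */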
1984 for (s = 0; s < s_max; s++) {
1985 if (slice_mask & (1<<s)) {
1986 subslice_mask |= ss_mask << (ss_max * s);
1987 }
1988 }
1989
1990 brw->perfquery.sys_vars.subslice_mask = subslice_mask;
1991 brw->perfquery.sys_vars.n_eu_sub_slices =
1992 __builtin_popcount(subslice_mask);
1993 }
1994
1995 brw->perfquery.sys_vars.eu_threads_count =
1996 brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
1997
1998 return true;
1999 }
2000
2001 static bool
2002 get_sysfs_dev_dir(struct brw_context *brw,
2003 char *path_buf,
2004 int path_buf_len)
2005 {
2006 __DRIscreen *screen = brw->screen->driScrnPriv;
2007 struct stat sb;
2008 int min, maj;
2009 DIR *drmdir;
2010 struct dirent *drm_entry;
2011 int len;
2012
2013 assert(path_buf);
2014 assert(path_buf_len);
2015 path_buf[0] = '\0';
2016
2017 if (fstat(screen->fd, &sb)) {
2018 DBG("Failed to stat DRM fd\n");
2019 return false;
2020 }
2021
2022 maj = major(sb.st_rdev);
2023 min = minor(sb.st_rdev);
2024
2025 if (!S_ISCHR(sb.st_mode)) {
2026 DBG("DRM fd is not a character device as expected\n");
2027 return false;
2028 }
2029
2030 len = snprintf(path_buf, path_buf_len,
2031 "/sys/dev/char/%d:%d/device/drm", maj, min);
2032 if (len < 0 || len >= path_buf_len) {
2033 DBG("Failed to concatenate sysfs path to drm device\n");
2034 return false;
2035 }
2036
2037 drmdir = opendir(path_buf);
2038 if (!drmdir) {
2039 DBG("Failed to open %s: %m\n", path_buf);
2040 return false;
2041 }
2042
2043 while ((drm_entry = readdir(drmdir))) {
2044 if ((drm_entry->d_type == DT_DIR ||
2045 drm_entry->d_type == DT_LNK) &&
2046 strncmp(drm_entry->d_name, "card", 4) == 0)
2047 {
2048 len = snprintf(path_buf, path_buf_len,
2049 "/sys/dev/char/%d:%d/device/drm/%s",
2050 maj, min, drm_entry->d_name);
2051 closedir(drmdir);
2052 if (len < 0 || len >= path_buf_len)
2053 return false;
2054 else
2055 return true;
2056 }
2057 }
2058
2059 closedir(drmdir);
2060
2061 DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
2062 maj, min);
2063
2064 return false;
2065 }
2066
2067 typedef void (*perf_register_oa_queries_t)(struct brw_context *);
2068
2069 static perf_register_oa_queries_t
2070 get_register_queries_function(const struct gen_device_info *devinfo)
2071 {
2072 if (devinfo->is_haswell)
2073 return brw_oa_register_queries_hsw;
2074 if (devinfo->is_cherryview)
2075 return brw_oa_register_queries_chv;
2076 if (devinfo->is_broadwell)
2077 return brw_oa_register_queries_bdw;
2078 if (devinfo->is_broxton)
2079 return brw_oa_register_queries_bxt;
2080 if (devinfo->is_skylake) {
2081 if (devinfo->gt == 2)
2082 return brw_oa_register_queries_sklgt2;
2083 if (devinfo->gt == 3)
2084 return brw_oa_register_queries_sklgt3;
2085 if (devinfo->gt == 4)
2086 return brw_oa_register_queries_sklgt4;
2087 }
2088 if (devinfo->is_kabylake) {
2089 if (devinfo->gt == 2)
2090 return brw_oa_register_queries_kblgt2;
2091 if (devinfo->gt == 3)
2092 return brw_oa_register_queries_kblgt3;
2093 }
2094 if (devinfo->is_geminilake)
2095 return brw_oa_register_queries_glk;
2096 if (devinfo->is_coffeelake) {
2097 if (devinfo->gt == 2)
2098 return brw_oa_register_queries_cflgt2;
2099 if (devinfo->gt == 3)
2100 return brw_oa_register_queries_cflgt3;
2101 }
2102
2103 return NULL;
2104 }
2105
2106 static unsigned
2107 brw_init_perf_query_info(struct gl_context *ctx)
2108 {
2109 struct brw_context *brw = brw_context(ctx);
2110 const struct gen_device_info *devinfo = &brw->screen->devinfo;
2111 bool i915_perf_oa_available = false;
2112 struct stat sb;
2113 char sysfs_dev_dir[128];
2114 perf_register_oa_queries_t oa_register;
2115
2116 if (brw->perfquery.n_queries)
2117 return brw->perfquery.n_queries;
2118
2119 init_pipeline_statistic_query_registers(brw);
2120
2121 oa_register = get_register_queries_function(devinfo);
2122
2123 /* The existence of this sysctl parameter implies the kernel supports
2124 * the i915 perf interface.
2125 */
2126 if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {
2127
2128 /* If _paranoid == 1 then on Gen8+ we won't be able to access OA
2129 * metrics unless running as root.
2130 */
2131 if (devinfo->is_haswell)
2132 i915_perf_oa_available = true;
2133 else {
2134 uint64_t paranoid = 1;
2135
2136 read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", ¶noid);
2137
2138 if (paranoid == 0 || geteuid() == 0)
2139 i915_perf_oa_available = true;
2140 }
2141 }
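   /* Note: an administrator can usually relax this at runtime, e.g. with
    * "sysctl dev.i915.perf_stream_paranoid=0" (equivalent to writing 0 to
    * the procfs file checked above), after which OA metrics are readable
    * without root privileges.
    */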
2142
2143 if (i915_perf_oa_available &&
2144 oa_register &&
2145 get_sysfs_dev_dir(brw, sysfs_dev_dir, sizeof(sysfs_dev_dir)) &&
2146 init_oa_sys_vars(brw, sysfs_dev_dir))
2147 {
2148 brw->perfquery.oa_metrics_table =
2149 _mesa_hash_table_create(NULL, _mesa_key_hash_string,
2150 _mesa_key_string_equal);
2151
2152 /* Index all the metric sets mesa knows about before looking to see what
2153 * the kernel is advertising.
2154 */
2155 oa_register(brw);
2156
2157 if (likely((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0) &&
2158 kernel_has_dynamic_config_support(brw, sysfs_dev_dir))
2159 init_oa_configs(brw, sysfs_dev_dir);
2160 else
2161 enumerate_sysfs_metrics(brw, sysfs_dev_dir);
2162 }
2163
2164 brw->perfquery.unaccumulated =
2165 ralloc_array(brw, struct brw_perf_query_object *, 2);
2166 brw->perfquery.unaccumulated_elements = 0;
2167 brw->perfquery.unaccumulated_array_size = 2;
2168
2169 exec_list_make_empty(&brw->perfquery.sample_buffers);
2170 exec_list_make_empty(&brw->perfquery.free_sample_buffers);
2171
2172    /* It's convenient to guarantee that this linked list of sample
2173     * buffers is never empty, so we add an empty head; that way, when we
2174     * begin an OA query we can always take a reference on a buffer in
2175     * this list.
2176     */
2177 struct brw_oa_sample_buf *buf = get_free_sample_buf(brw);
2178 exec_list_push_head(&brw->perfquery.sample_buffers, &buf->link);
2179
2180 brw->perfquery.oa_stream_fd = -1;
2181
2182 brw->perfquery.next_query_start_report_id = 1000;
2183
2184 return brw->perfquery.n_queries;
2185 }
2186
2187 void
2188 brw_init_performance_queries(struct brw_context *brw)
2189 {
2190 struct gl_context *ctx = &brw->ctx;
2191
2192 ctx->Driver.InitPerfQueryInfo = brw_init_perf_query_info;
2193 ctx->Driver.GetPerfQueryInfo = brw_get_perf_query_info;
2194 ctx->Driver.GetPerfCounterInfo = brw_get_perf_counter_info;
2195 ctx->Driver.NewPerfQueryObject = brw_new_perf_query_object;
2196 ctx->Driver.DeletePerfQuery = brw_delete_perf_query;
2197 ctx->Driver.BeginPerfQuery = brw_begin_perf_query;
2198 ctx->Driver.EndPerfQuery = brw_end_perf_query;
2199 ctx->Driver.WaitPerfQuery = brw_wait_perf_query;
2200 ctx->Driver.IsPerfQueryReady = brw_is_perf_query_ready;
2201 ctx->Driver.GetPerfQueryData = brw_get_perf_query_data;
2202 }
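
/* A minimal sketch of how the hooks installed above surface to an
 * application through GL_INTEL_performance_query (entry points are from
 * that extension; error handling and counter enumeration omitted, and
 * data_size would normally come from glGetPerfQueryInfoINTEL()):
 *
 *    GLuint query_id, handle, written = 0;
 *    glGetFirstPerfQueryIdINTEL(&query_id);
 *    glCreatePerfQueryINTEL(query_id, &handle);
 *    glBeginPerfQueryINTEL(handle);
 *    ... issue GL work ...
 *    glEndPerfQueryINTEL(handle);
 *    glGetPerfQueryDataINTEL(handle, GL_PERFQUERY_WAIT_INTEL,
 *                            data_size, data, &written);
 *    glDeletePerfQueryINTEL(handle);
 */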
2203