1 /*
2  * Copyright © 2018 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <dirent.h>
25 
26 #include <sys/types.h>
27 #include <sys/stat.h>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <errno.h>
31 
32 #ifndef HAVE_DIRENT_D_TYPE
33 #include <limits.h> // PATH_MAX
34 #endif
35 
36 #include <drm-uapi/i915_drm.h>
37 
38 #include "common/intel_gem.h"
39 
40 #include "dev/intel_debug.h"
41 #include "dev/intel_device_info.h"
42 
43 #include "perf/intel_perf.h"
44 #include "perf/intel_perf_regs.h"
45 #include "perf/intel_perf_mdapi.h"
46 #include "perf/intel_perf_metrics.h"
47 #include "perf/intel_perf_private.h"
48 
49 #include "util/bitscan.h"
50 #include "util/macros.h"
51 #include "util/mesa-sha1.h"
52 #include "util/u_math.h"
53 
54 #define FILE_DEBUG_FLAG DEBUG_PERFMON
55 
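/* Return true when a directory entry is a directory or a symlink. Without
 * dirent d_type support, fall back to lstat() on the composed path.
 */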
56 static bool
57 is_dir_or_link(const struct dirent *entry, const char *parent_dir)
58 {
59 #ifdef HAVE_DIRENT_D_TYPE
60    return entry->d_type == DT_DIR || entry->d_type == DT_LNK;
61 #else
62    struct stat st;
63    char path[PATH_MAX + 1];
64    snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name);
65    lstat(path, &st);
66    return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode);
67 #endif
68 }
69 
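/* Locate the sysfs DRM directory for the given DRM fd, i.e.
 * /sys/dev/char/<maj>:<min>/device/drm/cardX, and store it in
 * perf->sysfs_dev_dir.
 */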
70 static bool
71 get_sysfs_dev_dir(struct intel_perf_config *perf, int fd)
72 {
73    struct stat sb;
74    int min, maj;
75    DIR *drmdir;
76    struct dirent *drm_entry;
77    int len;
78 
79    perf->sysfs_dev_dir[0] = '\0';
80 
81    if (INTEL_DEBUG(DEBUG_NO_OACONFIG))
82       return true;
83 
84    if (fstat(fd, &sb)) {
85       DBG("Failed to stat DRM fd\n");
86       return false;
87    }
88 
89    maj = major(sb.st_rdev);
90    min = minor(sb.st_rdev);
91 
92    if (!S_ISCHR(sb.st_mode)) {
93       DBG("DRM fd is not a character device as expected\n");
94       return false;
95    }
96 
97    len = snprintf(perf->sysfs_dev_dir,
98                   sizeof(perf->sysfs_dev_dir),
99                   "/sys/dev/char/%d:%d/device/drm", maj, min);
100    if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) {
101       DBG("Failed to concatenate sysfs path to drm device\n");
102       return false;
103    }
104 
105    drmdir = opendir(perf->sysfs_dev_dir);
106    if (!drmdir) {
107       DBG("Failed to open %s: %m\n", perf->sysfs_dev_dir);
108       return false;
109    }
110 
111    while ((drm_entry = readdir(drmdir))) {
112       if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) &&
113           strncmp(drm_entry->d_name, "card", 4) == 0)
114       {
115          len = snprintf(perf->sysfs_dev_dir,
116                         sizeof(perf->sysfs_dev_dir),
117                         "/sys/dev/char/%d:%d/device/drm/%s",
118                         maj, min, drm_entry->d_name);
119          closedir(drmdir);
120          if (len < 0 || len >= sizeof(perf->sysfs_dev_dir))
121             return false;
122          else
123             return true;
124       }
125    }
126 
127    closedir(drmdir);
128 
129    DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n",
130        maj, min);
131 
132    return false;
133 }
134 
135 static bool
136 read_file_uint64(const char *file, uint64_t *val)
137 {
138     char buf[32];
139     int fd, n;
140 
141     fd = open(file, 0);
142     if (fd < 0)
143        return false;
144     while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 &&
145            errno == EINTR);
146     close(fd);
147     if (n < 0)
148        return false;
149 
150     buf[n] = '\0';
151     *val = strtoull(buf, NULL, 0);
152 
153     return true;
154 }
155 
156 static bool
157 read_sysfs_drm_device_file_uint64(struct intel_perf_config *perf,
158                                   const char *file,
159                                   uint64_t *value)
160 {
161    char buf[512];
162    int len;
163 
164    len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file);
165    if (len < 0 || len >= sizeof(buf)) {
166       DBG("Failed to concatenate sys filename to read u64 from\n");
167       return false;
168    }
169 
170    return read_file_uint64(buf, value);
171 }
172 
173 static void
174 register_oa_config(struct intel_perf_config *perf,
175                    const struct intel_device_info *devinfo,
176                    const struct intel_perf_query_info *query,
177                    uint64_t config_id)
178 {
179    struct intel_perf_query_info *registered_query =
180       intel_perf_append_query_info(perf, 0);
181 
182    *registered_query = *query;
183    registered_query->oa_metrics_set_id = config_id;
184    DBG("metric set registered: id = %" PRIu64", guid = %s\n",
185        registered_query->oa_metrics_set_id, query->guid);
186 }
187 
188 static void
189 enumerate_sysfs_metrics(struct intel_perf_config *perf,
190                         const struct intel_device_info *devinfo)
191 {
192    DIR *metricsdir = NULL;
193    struct dirent *metric_entry;
194    char buf[256];
195    int len;
196 
197    len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir);
198    if (len < 0 || len >= sizeof(buf)) {
199       DBG("Failed to concatenate path to sysfs metrics/ directory\n");
200       return;
201    }
202 
203    metricsdir = opendir(buf);
204    if (!metricsdir) {
205       DBG("Failed to open %s: %m\n", buf);
206       return;
207    }
208 
209    while ((metric_entry = readdir(metricsdir))) {
210       struct hash_entry *entry;
211       if (!is_dir_or_link(metric_entry, buf) ||
212           metric_entry->d_name[0] == '.')
213          continue;
214 
215       DBG("metric set: %s\n", metric_entry->d_name);
216       entry = _mesa_hash_table_search(perf->oa_metrics_table,
217                                       metric_entry->d_name);
218       if (entry) {
219          uint64_t id;
220          if (!intel_perf_load_metric_id(perf, metric_entry->d_name, &id)) {
221             DBG("Failed to read metric set id from %s: %m", buf);
222             continue;
223          }
224 
225          register_oa_config(perf, devinfo,
226                             (const struct intel_perf_query_info *)entry->data, id);
227       } else
228          DBG("metric set not known by mesa (skipping)\n");
229    }
230 
231    closedir(metricsdir);
232 }
233 
234 static void
235 add_all_metrics(struct intel_perf_config *perf,
236                 const struct intel_device_info *devinfo)
237 {
238    hash_table_foreach(perf->oa_metrics_table, entry) {
239       const struct intel_perf_query_info *query = entry->data;
240       register_oa_config(perf, devinfo, query, 0);
241    }
242 }
243 
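/* Probe for dynamic OA config support: removing a deliberately invalid
 * config id only fails with ENOENT on kernels that implement the
 * add/remove config ioctls.
 */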
244 static bool
245 kernel_has_dynamic_config_support(struct intel_perf_config *perf, int fd)
246 {
247    uint64_t invalid_config_id = UINT64_MAX;
248 
249    return intel_ioctl(fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG,
250                     &invalid_config_id) < 0 && errno == ENOENT;
251 }
252 
253 static bool
254 i915_query_perf_config_supported(struct intel_perf_config *perf, int fd)
255 {
256    int32_t length = 0;
257    return !intel_i915_query_flags(fd, DRM_I915_QUERY_PERF_CONFIG,
258                                   DRM_I915_QUERY_PERF_CONFIG_LIST,
259                                   NULL, &length);
260 }
261 
262 static bool
263 i915_query_perf_config_data(struct intel_perf_config *perf,
264                             int fd, const char *guid,
265                             struct drm_i915_perf_oa_config *config)
266 {
267    char data[sizeof(struct drm_i915_query_perf_config) +
268              sizeof(struct drm_i915_perf_oa_config)] = {};
269    struct drm_i915_query_perf_config *query = (void *)data;
270 
271    memcpy(query->uuid, guid, sizeof(query->uuid));
272    memcpy(query->data, config, sizeof(*config));
273 
274    int32_t item_length = sizeof(data);
275    if (intel_i915_query_flags(fd, DRM_I915_QUERY_PERF_CONFIG,
276                               DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID,
277                               query, &item_length))
278       return false;
279 
280    memcpy(config, query->data, sizeof(*config));
281 
282    return true;
283 }
284 
285 bool
286 intel_perf_load_metric_id(struct intel_perf_config *perf_cfg,
287                           const char *guid,
288                           uint64_t *metric_id)
289 {
290    char config_path[280];
291 
292    snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id",
293             perf_cfg->sysfs_dev_dir, guid);
294 
295    /* Don't recreate already loaded configs. */
296    return read_file_uint64(config_path, metric_id);
297 }
298 
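/* Register an OA config with i915. Returns the kernel's metric set id
 * (> 0) on success, 0 on failure.
 */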
299 static uint64_t
300 i915_add_config(struct intel_perf_config *perf, int fd,
301                 const struct intel_perf_registers *config,
302                 const char *guid)
303 {
304    struct drm_i915_perf_oa_config i915_config = { 0, };
305 
306    memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid));
307 
308    i915_config.n_mux_regs = config->n_mux_regs;
309    i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs);
310 
311    i915_config.n_boolean_regs = config->n_b_counter_regs;
312    i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs);
313 
314    i915_config.n_flex_regs = config->n_flex_regs;
315    i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs);
316 
317    int ret = intel_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config);
318    return ret > 0 ? ret : 0;
319 }
320 
321 static void
322 init_oa_configs(struct intel_perf_config *perf, int fd,
323                 const struct intel_device_info *devinfo)
324 {
325    hash_table_foreach(perf->oa_metrics_table, entry) {
326       const struct intel_perf_query_info *query = entry->data;
327       uint64_t config_id;
328 
329       if (intel_perf_load_metric_id(perf, query->guid, &config_id)) {
330          DBG("metric set: %s (already loaded)\n", query->guid);
331          register_oa_config(perf, devinfo, query, config_id);
332          continue;
333       }
334 
335       int ret = i915_add_config(perf, fd, &query->config, query->guid);
336       if (ret < 0) {
337          DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n",
338              query->name, query->guid, strerror(errno));
339          continue;
340       }
341 
342       register_oa_config(perf, devinfo, query, ret);
343       DBG("metric set: %s (added)\n", query->guid);
344    }
345 }
346 
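/* Derive EU/slice/subslice counts and masks from the device topology.
 * These feed the builtin variables referenced by the metric set equations.
 */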
347 static void
348 compute_topology_builtins(struct intel_perf_config *perf)
349 {
350    const struct intel_device_info *devinfo = &perf->devinfo;
351 
352    perf->sys_vars.slice_mask = devinfo->slice_masks;
353    perf->sys_vars.n_eu_slices = devinfo->num_slices;
354 
355    perf->sys_vars.n_eu_slice0123 = 0;
356    for (int s = 0; s < MIN2(4, devinfo->max_slices); s++) {
357       if (!intel_device_info_slice_available(devinfo, s))
358          continue;
359 
360       for (int ss = 0; ss < devinfo->max_subslices_per_slice; ss++) {
361          if (!intel_device_info_subslice_available(devinfo, s, ss))
362             continue;
363 
364          for (int eu = 0; eu < devinfo->max_eus_per_subslice; eu++) {
365             if (intel_device_info_eu_available(devinfo, s, ss, eu))
366                perf->sys_vars.n_eu_slice0123++;
367          }
368       }
369    }
370 
371    for (int i = 0; i < sizeof(devinfo->subslice_masks[i]); i++) {
372       perf->sys_vars.n_eu_sub_slices +=
373          util_bitcount(devinfo->subslice_masks[i]);
374    }
375 
376    for (int i = 0; i < sizeof(devinfo->eu_masks); i++)
377       perf->sys_vars.n_eus += util_bitcount(devinfo->eu_masks[i]);
378 
379    /* The subslice mask builtin contains bits for all slices. Prior to Gfx11
380     * it had groups of 3bits for each slice, on Gfx11 and above it's 8bits for
381     * each slice.
382     *
383     * Ideally equations would be updated to have a slice/subslice query
384     * function/operator.
385     */
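   /* For example, with 3 bits per slice (pre-Gfx11), subslice 1 of slice 2
    * lands at bit 2 * 3 + 1 = 7 of the builtin mask.
    */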
386    perf->sys_vars.subslice_mask = 0;
387 
388    int bits_per_subslice = devinfo->ver >= 11 ? 8 : 3;
389 
390    for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) {
391       for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) {
392          if (intel_device_info_subslice_available(devinfo, s, ss))
393             perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss);
394       }
395    }
396 }
397 
398 static bool
399 init_oa_sys_vars(struct intel_perf_config *perf,
400                  bool use_register_snapshots)
401 {
402    uint64_t min_freq_mhz = 0, max_freq_mhz = 0;
403 
404    if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
405       if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz))
406          return false;
407 
408       if (!read_sysfs_drm_device_file_uint64(perf,  "gt_max_freq_mhz", &max_freq_mhz))
409          return false;
410    } else {
411       min_freq_mhz = 300;
412       max_freq_mhz = 1000;
413    }
414 
415    memset(&perf->sys_vars, 0, sizeof(perf->sys_vars));
416    perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000;
417    perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
418    perf->sys_vars.query_mode = use_register_snapshots;
419    compute_topology_builtins(perf);
420 
421    return true;
422 }
423 
424 typedef void (*perf_register_oa_queries_t)(struct intel_perf_config *);
425 
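/* Return the generated per-platform function registering all the OA metric
 * sets Mesa knows for this device, or NULL when there are none.
 */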
426 static perf_register_oa_queries_t
427 get_register_queries_function(const struct intel_device_info *devinfo)
428 {
429    switch (devinfo->platform) {
430    case INTEL_PLATFORM_HSW:
431       return intel_oa_register_queries_hsw;
432    case INTEL_PLATFORM_CHV:
433       return intel_oa_register_queries_chv;
434    case INTEL_PLATFORM_BDW:
435       return intel_oa_register_queries_bdw;
436    case INTEL_PLATFORM_BXT:
437       return intel_oa_register_queries_bxt;
438    case INTEL_PLATFORM_SKL:
439       if (devinfo->gt == 2)
440          return intel_oa_register_queries_sklgt2;
441       if (devinfo->gt == 3)
442          return intel_oa_register_queries_sklgt3;
443       if (devinfo->gt == 4)
444          return intel_oa_register_queries_sklgt4;
445       return NULL;
446    case INTEL_PLATFORM_KBL:
447       if (devinfo->gt == 2)
448          return intel_oa_register_queries_kblgt2;
449       if (devinfo->gt == 3)
450          return intel_oa_register_queries_kblgt3;
451       return NULL;
452    case INTEL_PLATFORM_GLK:
453       return intel_oa_register_queries_glk;
454    case INTEL_PLATFORM_CFL:
455       if (devinfo->gt == 2)
456          return intel_oa_register_queries_cflgt2;
457       if (devinfo->gt == 3)
458          return intel_oa_register_queries_cflgt3;
459       return NULL;
460    case INTEL_PLATFORM_ICL:
461       return intel_oa_register_queries_icl;
462    case INTEL_PLATFORM_EHL:
463       return intel_oa_register_queries_ehl;
464    case INTEL_PLATFORM_TGL:
465       if (devinfo->gt == 1)
466          return intel_oa_register_queries_tglgt1;
467       if (devinfo->gt == 2)
468          return intel_oa_register_queries_tglgt2;
469       return NULL;
470    case INTEL_PLATFORM_RKL:
471       return intel_oa_register_queries_rkl;
472    case INTEL_PLATFORM_DG1:
473       return intel_oa_register_queries_dg1;
474    case INTEL_PLATFORM_ADL:
475       return intel_oa_register_queries_adl;
476    default:
477       return NULL;
478    }
479 }
480 
481 static int
482 intel_perf_compare_counter_names(const void *v1, const void *v2)
483 {
484    const struct intel_perf_query_counter *c1 = v1;
485    const struct intel_perf_query_counter *c2 = v2;
486 
487    return strcmp(c1->name, c2->name);
488 }
489 
490 static void
491 sort_query(struct intel_perf_query_info *q)
492 {
493    qsort(q->counters, q->n_counters, sizeof(q->counters[0]),
494          intel_perf_compare_counter_names);
495 }
496 
497 static void
498 load_pipeline_statistic_metrics(struct intel_perf_config *perf_cfg,
499                                 const struct intel_device_info *devinfo)
500 {
501    struct intel_perf_query_info *query =
502       intel_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS);
503 
504    query->kind = INTEL_PERF_QUERY_TYPE_PIPELINE;
505    query->name = "Pipeline Statistics Registers";
506 
507    intel_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT,
508                                        "N vertices submitted");
509    intel_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
510                                        "N primitives submitted");
511    intel_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT,
512                                        "N vertex shader invocations");
513 
514    if (devinfo->ver == 6) {
515       intel_perf_query_add_stat_reg(query, GFX6_SO_PRIM_STORAGE_NEEDED, 1, 1,
516                                     "SO_PRIM_STORAGE_NEEDED",
517                                     "N geometry shader stream-out primitives (total)");
518       intel_perf_query_add_stat_reg(query, GFX6_SO_NUM_PRIMS_WRITTEN, 1, 1,
519                                     "SO_NUM_PRIMS_WRITTEN",
520                                     "N geometry shader stream-out primitives (written)");
521    } else {
522       intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
523                                     "SO_PRIM_STORAGE_NEEDED (Stream 0)",
524                                     "N stream-out (stream 0) primitives (total)");
525       intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
526                                     "SO_PRIM_STORAGE_NEEDED (Stream 1)",
527                                     "N stream-out (stream 1) primitives (total)");
528       intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
529                                     "SO_PRIM_STORAGE_NEEDED (Stream 2)",
530                                     "N stream-out (stream 2) primitives (total)");
531       intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
532                                     "SO_PRIM_STORAGE_NEEDED (Stream 3)",
533                                     "N stream-out (stream 3) primitives (total)");
534       intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
535                                     "SO_NUM_PRIMS_WRITTEN (Stream 0)",
536                                     "N stream-out (stream 0) primitives (written)");
537       intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
538                                     "SO_NUM_PRIMS_WRITTEN (Stream 1)",
539                                     "N stream-out (stream 1) primitives (written)");
540       intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
541                                     "SO_NUM_PRIMS_WRITTEN (Stream 2)",
542                                     "N stream-out (stream 2) primitives (written)");
543       intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
544                                     "SO_NUM_PRIMS_WRITTEN (Stream 3)",
545                                     "N stream-out (stream 3) primitives (written)");
546    }
547 
548    intel_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT,
549                                        "N TCS shader invocations");
550    intel_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT,
551                                        "N TES shader invocations");
552 
553    intel_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT,
554                                        "N geometry shader invocations");
555    intel_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
556                                        "N geometry shader primitives emitted");
557 
558    intel_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT,
559                                        "N primitives entering clipping");
560    intel_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
561                                        "N primitives leaving clipping");
562 
563    if (devinfo->verx10 == 75 || devinfo->ver == 8) {
564       intel_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
565                                     "N fragment shader invocations",
566                                     "N fragment shader invocations");
567    } else {
568       intel_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT,
569                                           "N fragment shader invocations");
570    }
571 
572    intel_perf_query_add_basic_stat_reg(query, PS_DEPTH_COUNT,
573                                        "N z-pass fragments");
574 
575    if (devinfo->ver >= 7) {
576       intel_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
577                                           "N compute shader invocations");
578    }
579 
580    query->data_size = sizeof(uint64_t) * query->n_counters;
581 
582    sort_query(query);
583 }
584 
585 static int
586 i915_perf_version(int drm_fd)
587 {
588    int tmp;
589    drm_i915_getparam_t gp = {
590       .param = I915_PARAM_PERF_REVISION,
591       .value = &tmp,
592    };
593 
594    int ret = intel_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);
595 
596    /* Return 0 if this getparam is not supported, the first version supported
597     * is 1.
598     */
599    return ret < 0 ? 0 : tmp;
600 }
601 
602 static void
603 i915_get_sseu(int drm_fd, struct drm_i915_gem_context_param_sseu *sseu)
604 {
605    struct drm_i915_gem_context_param arg = {
606       .param = I915_CONTEXT_PARAM_SSEU,
607       .size = sizeof(*sseu),
608       .value = to_user_pointer(sseu)
609    };
610 
611    intel_ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg);
612 }
613 
614 static inline int
615 compare_str_or_null(const char *s1, const char *s2)
616 {
617    if (s1 == NULL && s2 == NULL)
618       return 0;
619    if (s1 == NULL)
620       return -1;
621    if (s2 == NULL)
622       return 1;
623 
624    return strcmp(s1, s2);
625 }
626 
627 static int
628 compare_counter_categories_and_names(const void *_c1, const void *_c2)
629 {
630    const struct intel_perf_query_counter_info *c1 = (const struct intel_perf_query_counter_info *)_c1;
631    const struct intel_perf_query_counter_info *c2 = (const struct intel_perf_query_counter_info *)_c2;
632 
633    /* pipeline counters don't have an assigned category */
634    int r = compare_str_or_null(c1->counter->category, c2->counter->category);
635    if (r)
636       return r;
637 
638    return strcmp(c1->counter->name, c2->counter->name);
639 }
640 
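/* Build perf->counter_infos: a deduplicated, sorted array of all counters
 * across the registered queries, each with a bitmask of the queries that
 * expose it.
 */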
641 static void
642 build_unique_counter_list(struct intel_perf_config *perf)
643 {
644    assert(perf->n_queries < 64);
645 
646    size_t max_counters = 0;
647 
648    for (int q = 0; q < perf->n_queries; q++)
649       max_counters += perf->queries[q].n_counters;
650 
651    /*
652     * Allocate big enough array to hold maximum possible number of counters.
653     * We can't alloc it small and realloc when needed because the hash table
654     * below contains pointers to this array.
655     */
656    struct intel_perf_query_counter_info *counter_infos =
657          ralloc_array_size(perf, sizeof(counter_infos[0]), max_counters);
658 
659    perf->n_counters = 0;
660 
661    struct hash_table *counters_table =
662       _mesa_hash_table_create(perf,
663                               _mesa_hash_string,
664                               _mesa_key_string_equal);
665    struct hash_entry *entry;
666    for (int q = 0; q < perf->n_queries ; q++) {
667       struct intel_perf_query_info *query = &perf->queries[q];
668 
669       for (int c = 0; c < query->n_counters; c++) {
670          struct intel_perf_query_counter *counter;
671          struct intel_perf_query_counter_info *counter_info;
672 
673          counter = &query->counters[c];
674          entry = _mesa_hash_table_search(counters_table, counter->symbol_name);
675 
676          if (entry) {
677             counter_info = entry->data;
678             counter_info->query_mask |= BITFIELD64_BIT(q);
679             continue;
680          }
681          assert(perf->n_counters < max_counters);
682 
683          counter_info = &counter_infos[perf->n_counters++];
684          counter_info->counter = counter;
685          counter_info->query_mask = BITFIELD64_BIT(q);
686 
687          counter_info->location.group_idx = q;
688          counter_info->location.counter_idx = c;
689 
690          _mesa_hash_table_insert(counters_table, counter->symbol_name, counter_info);
691       }
692    }
693 
694    _mesa_hash_table_destroy(counters_table, NULL);
695 
696    /* Now we can realloc counter_infos array because hash table doesn't exist. */
697    perf->counter_infos = reralloc_array_size(perf, counter_infos,
698          sizeof(counter_infos[0]), perf->n_counters);
699 
700    qsort(perf->counter_infos, perf->n_counters, sizeof(perf->counter_infos[0]),
701          compare_counter_categories_and_names);
702 }
703 
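/* Check every condition required to expose OA metrics: a per-platform set of
 * metric registrations, an accessible i915 perf interface (subject to
 * perf_stream_paranoid), and readable sysfs device information.
 */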
704 static bool
705 oa_metrics_available(struct intel_perf_config *perf, int fd,
706                      const struct intel_device_info *devinfo,
707                      bool use_register_snapshots)
708 {
709    perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
710    bool i915_perf_oa_available = false;
711    struct stat sb;
712 
713    perf->devinfo = *devinfo;
714    perf->i915_query_supported = i915_query_perf_config_supported(perf, fd);
715    perf->i915_perf_version = i915_perf_version(fd);
716 
717    /* TODO: We should query this from i915 */
718    if (intel_device_info_is_dg2(devinfo))
719       perf->oa_timestamp_shift = 1;
720 
721    perf->oa_timestamp_mask =
722       0xffffffffffffffffull >> (32 + perf->oa_timestamp_shift);
723 
724    /* Record the default SSEU configuration. */
725    i915_get_sseu(fd, &perf->sseu);
726 
727    /* The existence of this sysctl parameter implies the kernel supports
728     * the i915 perf interface.
729     */
730    if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) {
731 
732       /* If _paranoid == 1 then on Gfx8+ we won't be able to access OA
733        * metrics unless running as root.
734        */
735       if (devinfo->platform == INTEL_PLATFORM_HSW)
736          i915_perf_oa_available = true;
737       else {
738          uint64_t paranoid = 1;
739 
740          read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", &paranoid);
741 
742          if (paranoid == 0 || geteuid() == 0)
743             i915_perf_oa_available = true;
744       }
745 
746       perf->platform_supported = oa_register != NULL;
747    }
748 
749    return i915_perf_oa_available &&
750           oa_register &&
751           get_sysfs_dev_dir(perf, fd) &&
752           init_oa_sys_vars(perf, use_register_snapshots);
753 }
754 
755 static void
756 load_oa_metrics(struct intel_perf_config *perf, int fd,
757                 const struct intel_device_info *devinfo)
758 {
759    int existing_queries = perf->n_queries;
760 
761    perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo);
762 
763    perf->oa_metrics_table =
764       _mesa_hash_table_create(perf, _mesa_hash_string,
765                               _mesa_key_string_equal);
766 
767    /* Index all the metric sets mesa knows about before looking to see what
768     * the kernel is advertising.
769     */
770    oa_register(perf);
771 
772    if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
773       if (kernel_has_dynamic_config_support(perf, fd))
774          init_oa_configs(perf, fd, devinfo);
775       else
776          enumerate_sysfs_metrics(perf, devinfo);
777    } else {
778       add_all_metrics(perf, devinfo);
779    }
780 
781    /* sort counters in each individual group created by this function by name */
782    for (int i = existing_queries; i < perf->n_queries; ++i)
783       sort_query(&perf->queries[i]);
784 
785    /* Select a fallback OA metric. Look for the TestOa metric or use the last
786     * one if not present (on HSW).
787     */
788    for (int i = existing_queries; i < perf->n_queries; i++) {
789       if (perf->queries[i].symbol_name &&
790           strcmp(perf->queries[i].symbol_name, "TestOa") == 0) {
791          perf->fallback_raw_oa_metric = perf->queries[i].oa_metrics_set_id;
792          break;
793       }
794    }
795    if (perf->fallback_raw_oa_metric == 0 && perf->n_queries > 0)
796       perf->fallback_raw_oa_metric = perf->queries[perf->n_queries - 1].oa_metrics_set_id;
797 }
798 
799 struct intel_perf_registers *
800 intel_perf_load_configuration(struct intel_perf_config *perf_cfg, int fd, const char *guid)
801 {
802    if (!perf_cfg->i915_query_supported)
803       return NULL;
804 
805    struct drm_i915_perf_oa_config i915_config = { 0, };
806    if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config))
807       return NULL;
808 
809    struct intel_perf_registers *config = rzalloc(NULL, struct intel_perf_registers);
810    config->n_flex_regs = i915_config.n_flex_regs;
811    config->flex_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_flex_regs);
812    config->n_mux_regs = i915_config.n_mux_regs;
813    config->mux_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_mux_regs);
814    config->n_b_counter_regs = i915_config.n_boolean_regs;
815    config->b_counter_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_b_counter_regs);
816 
817    /*
818     * struct intel_perf_query_register_prog maps exactly to the tuple of
819     * (register offset, register value) returned by the i915.
820     */
821    i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs);
822    i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs);
823    i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs);
824    if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) {
825       ralloc_free(config);
826       return NULL;
827    }
828 
829    return config;
830 }
831 
832 uint64_t
833 intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd,
834                                const struct intel_perf_registers *config,
835                                const char *guid)
836 {
837    if (guid)
838       return i915_add_config(perf_cfg, fd, config, guid);
839 
840    struct mesa_sha1 sha1_ctx;
841    _mesa_sha1_init(&sha1_ctx);
842 
843    if (config->flex_regs) {
844       _mesa_sha1_update(&sha1_ctx, config->flex_regs,
845                         sizeof(config->flex_regs[0]) *
846                         config->n_flex_regs);
847    }
848    if (config->mux_regs) {
849       _mesa_sha1_update(&sha1_ctx, config->mux_regs,
850                         sizeof(config->mux_regs[0]) *
851                         config->n_mux_regs);
852    }
853    if (config->b_counter_regs) {
854       _mesa_sha1_update(&sha1_ctx, config->b_counter_regs,
855                         sizeof(config->b_counter_regs[0]) *
856                         config->n_b_counter_regs);
857    }
858 
859    uint8_t hash[20];
860    _mesa_sha1_final(&sha1_ctx, hash);
861 
862    char formatted_hash[41];
863    _mesa_sha1_format(formatted_hash, hash);
864 
865    char generated_guid[37];
866    snprintf(generated_guid, sizeof(generated_guid),
867             "%.8s-%.4s-%.4s-%.4s-%.12s",
868             &formatted_hash[0], &formatted_hash[8],
869             &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4],
870             &formatted_hash[8 + 4 + 4 + 4]);
871 
872    /* Check if already present. */
873    uint64_t id;
874    if (intel_perf_load_metric_id(perf_cfg, generated_guid, &id))
875       return id;
876 
877    return i915_add_config(perf_cfg, fd, config, generated_guid);
878 }
879 
880 static uint64_t
881 get_passes_mask(struct intel_perf_config *perf,
882                 const uint32_t *counter_indices,
883                 uint32_t counter_indices_count)
884 {
885    uint64_t queries_mask = 0;
886 
887    assert(perf->n_queries < 64);
888 
889    /* Compute the number of passes by going through all counters N times (with
890     * N the number of queries) to make sure we select the most constraining
891     * counters first and look at the more flexible ones (that could be
892     * obtained from multiple queries) later. That way we minimize the number
893     * of passes required.
894     */
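   /* For example, a counter exposed by exactly one query forces that query
    * into the mask on the first iteration (q == 0); counters exposed by
    * several queries are only considered on later iterations and are skipped
    * when one of their queries has already been selected.
    */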
895    for (uint32_t q = 0; q < perf->n_queries; q++) {
896       for (uint32_t i = 0; i < counter_indices_count; i++) {
897          assert(counter_indices[i] < perf->n_counters);
898 
899          uint32_t idx = counter_indices[i];
900          if (util_bitcount64(perf->counter_infos[idx].query_mask) != (q + 1))
901             continue;
902 
903          if (queries_mask & perf->counter_infos[idx].query_mask)
904             continue;
905 
906          queries_mask |= BITFIELD64_BIT(ffsll(perf->counter_infos[idx].query_mask) - 1);
907       }
908    }
909 
910    return queries_mask;
911 }
912 
913 uint32_t
914 intel_perf_get_n_passes(struct intel_perf_config *perf,
915                         const uint32_t *counter_indices,
916                         uint32_t counter_indices_count,
917                         struct intel_perf_query_info **pass_queries)
918 {
919    uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count);
920 
921    if (pass_queries) {
922       uint32_t pass = 0;
923       for (uint32_t q = 0; q < perf->n_queries; q++) {
924          if ((1ULL << q) & queries_mask)
925             pass_queries[pass++] = &perf->queries[q];
926       }
927    }
928 
929    return util_bitcount64(queries_mask);
930 }
931 
932 void
933 intel_perf_get_counters_passes(struct intel_perf_config *perf,
934                                const uint32_t *counter_indices,
935                                uint32_t counter_indices_count,
936                                struct intel_perf_counter_pass *counter_pass)
937 {
938    uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count);
939    ASSERTED uint32_t n_passes = util_bitcount64(queries_mask);
940 
941    for (uint32_t i = 0; i < counter_indices_count; i++) {
942       assert(counter_indices[i] < perf->n_counters);
943 
944       uint32_t idx = counter_indices[i];
945       counter_pass[i].counter = perf->counter_infos[idx].counter;
946 
947       uint32_t query_idx = ffsll(perf->counter_infos[idx].query_mask & queries_mask) - 1;
948       counter_pass[i].query = &perf->queries[query_idx];
949 
950       uint32_t clear_bits = 63 - query_idx;
951       counter_pass[i].pass = util_bitcount64((queries_mask << clear_bits) >> clear_bits) - 1;
952       assert(counter_pass[i].pass < n_passes);
953    }
954 }
955 
956 /* Accumulate 32bits OA counters */
957 static inline void
958 accumulate_uint32(const uint32_t *report0,
959                   const uint32_t *report1,
960                   uint64_t *accumulator)
961 {
962    *accumulator += (uint32_t)(*report1 - *report0);
963 }
964 
965 /* Accumulate 40bits OA counters */
966 static inline void
967 accumulate_uint40(int a_index,
968                   const uint32_t *report0,
969                   const uint32_t *report1,
970                   uint64_t *accumulator)
971 {
972    const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
973    const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
974    uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
975    uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
976    uint64_t value0 = report0[a_index + 4] | high0;
977    uint64_t value1 = report1[a_index + 4] | high1;
978    uint64_t delta;
979 
980    if (value0 > value1)
981       delta = (1ULL << 40) + value1 - value0;
982    else
983       delta = value1 - value0;
984 
985    *accumulator += delta;
986 }
987 
988 static void
989 gfx8_read_report_clock_ratios(const uint32_t *report,
990                               uint64_t *slice_freq_hz,
991                               uint64_t *unslice_freq_hz)
992 {
993    /* The lower 16bits of the RPT_ID field of the OA reports contains a
994     * snapshot of the bits coming from the RP_FREQ_NORMAL register and is
995     * divided this way :
996     *
997     * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
998     * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
999     * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
1000     *
1001     * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
1002     *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
1003     *
1004     * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
1005     *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
1006     */
1007 
1008    uint32_t unslice_freq = report[0] & 0x1ff;
1009    uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
1010    uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
1011    uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
1012 
1013    *slice_freq_hz = slice_freq * 16666667ULL;
1014    *unslice_freq_hz = unslice_freq * 16666667ULL;
1015 }
1016 
1017 void
1018 intel_perf_query_result_read_frequencies(struct intel_perf_query_result *result,
1019                                          const struct intel_device_info *devinfo,
1020                                          const uint32_t *start,
1021                                          const uint32_t *end)
1022 {
1023    /* Slice/Unslice frequency is only available in the OA reports when the
1024     * "Disable OA reports due to clock ratio change" field in
1025     * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
1026     * global register (see drivers/gpu/drm/i915/i915_perf.c)
1027     *
1028     * Documentation says this should be available on Gfx9+ but experimentation
1029     * shows that Gfx8 reports similar values, so we enable it there too.
1030     */
1031    if (devinfo->ver < 8)
1032       return;
1033 
1034    gfx8_read_report_clock_ratios(start,
1035                                  &result->slice_frequency[0],
1036                                  &result->unslice_frequency[0]);
1037    gfx8_read_report_clock_ratios(end,
1038                                  &result->slice_frequency[1],
1039                                  &result->unslice_frequency[1]);
1040 }
1041 
1042 static inline bool
1043 can_use_mi_rpc_bc_counters(const struct intel_device_info *devinfo)
1044 {
1045    return devinfo->ver <= 11;
1046 }
1047 
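/* OA reports carry the GPU timestamp in dword 1; apply the platform's
 * timestamp shift set up in oa_metrics_available().
 */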
1048 uint64_t
1049 intel_perf_report_timestamp(const struct intel_perf_query_info *query,
1050                             const uint32_t *report)
1051 {
1052    return report[1] >> query->perf->oa_timestamp_shift;
1053 }
1054 
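/* Accumulate the deltas between two OA report snapshots into
 * result->accumulator, according to the query's report format.
 */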
1055 void
1056 intel_perf_query_result_accumulate(struct intel_perf_query_result *result,
1057                                    const struct intel_perf_query_info *query,
1058                                    const uint32_t *start,
1059                                    const uint32_t *end)
1060 {
1061    int i;
1062 
1063    if (result->hw_id == INTEL_PERF_INVALID_CTX_ID &&
1064        start[2] != INTEL_PERF_INVALID_CTX_ID)
1065       result->hw_id = start[2];
1066    if (result->reports_accumulated == 0)
1067       result->begin_timestamp = intel_perf_report_timestamp(query, start);
1068    result->end_timestamp = intel_perf_report_timestamp(query, end);
1069    result->reports_accumulated++;
1070 
1071    switch (query->oa_format) {
1072    case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
1073       result->accumulator[query->gpu_time_offset] =
1074          intel_perf_report_timestamp(query, end) -
1075          intel_perf_report_timestamp(query, start);
1076 
1077       accumulate_uint32(start + 3, end + 3,
1078                         result->accumulator + query->gpu_clock_offset); /* clock */
1079 
1080       /* 32x 40bit A counters... */
1081       for (i = 0; i < 32; i++) {
1082          accumulate_uint40(i, start, end,
1083                            result->accumulator + query->a_offset + i);
1084       }
1085 
1086       /* 4x 32bit A counters... */
1087       for (i = 0; i < 4; i++) {
1088          accumulate_uint32(start + 36 + i, end + 36 + i,
1089                            result->accumulator + query->a_offset + 32 + i);
1090       }
1091 
1092       if (can_use_mi_rpc_bc_counters(&query->perf->devinfo)) {
1093          /* 8x 32bit B counters */
1094          for (i = 0; i < 8; i++) {
1095             accumulate_uint32(start + 48 + i, end + 48 + i,
1096                               result->accumulator + query->b_offset + i);
1097          }
1098 
1099          /* 8x 32bit C counters... */
1100          for (i = 0; i < 8; i++) {
1101             accumulate_uint32(start + 56 + i, end + 56 + i,
1102                               result->accumulator + query->c_offset + i);
1103          }
1104       }
1105       break;
1106 
1107    case I915_OA_FORMAT_A45_B8_C8:
1108       result->accumulator[query->gpu_time_offset] =
1109          intel_perf_report_timestamp(query, end) -
1110          intel_perf_report_timestamp(query, start);
1111 
1112       for (i = 0; i < 61; i++) {
1113          accumulate_uint32(start + 3 + i, end + 3 + i,
1114                            result->accumulator + query->a_offset + i);
1115       }
1116       break;
1117 
1118    default:
1119       unreachable("Can't accumulate OA counters in unknown format");
1120    }
1121 
1122 }
1123 
1124 #define GET_FIELD(word, field) (((word)  & field ## _MASK) >> field ## _SHIFT)
1125 
1126 void
1127 intel_perf_query_result_read_gt_frequency(struct intel_perf_query_result *result,
1128                                           const struct intel_device_info *devinfo,
1129                                           const uint32_t start,
1130                                           const uint32_t end)
1131 {
1132    switch (devinfo->ver) {
1133    case 7:
1134    case 8:
1135       result->gt_frequency[0] = GET_FIELD(start, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
1136       result->gt_frequency[1] = GET_FIELD(end, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
1137       break;
1138    case 9:
1139    case 11:
1140    case 12:
1141       result->gt_frequency[0] = GET_FIELD(start, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
1142       result->gt_frequency[1] = GET_FIELD(end, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
1143       break;
1144    default:
1145       unreachable("unexpected gen");
1146    }
1147 
1148    /* Put the numbers into Hz. */
1149    result->gt_frequency[0] *= 1000000ULL;
1150    result->gt_frequency[1] *= 1000000ULL;
1151 }
1152 
1153 void
1154 intel_perf_query_result_read_perfcnts(struct intel_perf_query_result *result,
1155                                       const struct intel_perf_query_info *query,
1156                                       const uint64_t *start,
1157                                       const uint64_t *end)
1158 {
1159    for (uint32_t i = 0; i < 2; i++) {
1160       uint64_t v0 = start[i] & PERF_CNT_VALUE_MASK;
1161       uint64_t v1 = end[i] & PERF_CNT_VALUE_MASK;
1162 
1163       result->accumulator[query->perfcnt_offset + i] = v0 > v1 ?
1164          (PERF_CNT_VALUE_MASK + 1 + v1 - v0) :
1165          (v1 - v0);
1166    }
1167 }
1168 
1169 static uint32_t
1170 query_accumulator_offset(const struct intel_perf_query_info *query,
1171                          enum intel_perf_query_field_type type,
1172                          uint8_t index)
1173 {
1174    switch (type) {
1175    case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
1176       return query->perfcnt_offset + index;
1177    case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
1178       return query->a_offset + index;
1179    case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
1180       return query->b_offset + index;
1181    case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
1182       return query->c_offset + index;
1183    default:
1184       unreachable("Invalid register type");
1185       return 0;
1186    }
1187 }
1188 
1189 void
1190 intel_perf_query_result_accumulate_fields(struct intel_perf_query_result *result,
1191                                           const struct intel_perf_query_info *query,
1192                                           const void *start,
1193                                           const void *end,
1194                                           bool no_oa_accumulate)
1195 {
1196    const struct intel_perf_query_field_layout *layout = &query->perf->query_layout;
1197    const struct intel_device_info *devinfo = &query->perf->devinfo;
1198 
1199    for (uint32_t r = 0; r < layout->n_fields; r++) {
1200       const struct intel_perf_query_field *field = &layout->fields[r];
1201 
1202       if (field->type == INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC) {
1203          intel_perf_query_result_read_frequencies(result, devinfo,
1204                                                 start + field->location,
1205                                                 end + field->location);
1206          /* no_oa_accumulate=true is used when doing GL perf queries, we
1207           * manually parse the OA reports from the OA buffer and subtract
1208           * unrelated deltas, so don't accumulate the begin/end reports here.
1209           */
1210          if (!no_oa_accumulate) {
1211             intel_perf_query_result_accumulate(result, query,
1212                                                start + field->location,
1213                                                end + field->location);
1214          }
1215       } else {
1216          uint64_t v0, v1;
1217 
1218          if (field->size == 4) {
1219             v0 = *(const uint32_t *)(start + field->location);
1220             v1 = *(const uint32_t *)(end + field->location);
1221          } else {
1222             assert(field->size == 8);
1223             v0 = *(const uint64_t *)(start + field->location);
1224             v1 = *(const uint64_t *)(end + field->location);
1225          }
1226 
1227          if (field->mask) {
1228             v0 = field->mask & v0;
1229             v1 = field->mask & v1;
1230          }
1231 
1232          /* RPSTAT is a bit of a special case because its begin/end values
1233           * represent frequencies. We store it in a separate location.
1234           */
1235          if (field->type == INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT)
1236             intel_perf_query_result_read_gt_frequency(result, devinfo, v0, v1);
1237          else
1238             result->accumulator[query_accumulator_offset(query, field->type, field->index)] = v1 - v0;
1239       }
1240    }
1241 }
1242 
1243 void
1244 intel_perf_query_result_clear(struct intel_perf_query_result *result)
1245 {
1246    memset(result, 0, sizeof(*result));
1247    result->hw_id = INTEL_PERF_INVALID_CTX_ID;
1248 }
1249 
1250 void
1251 intel_perf_query_result_print_fields(const struct intel_perf_query_info *query,
1252                                      const void *data)
1253 {
1254    const struct intel_perf_query_field_layout *layout = &query->perf->query_layout;
1255 
1256    for (uint32_t r = 0; r < layout->n_fields; r++) {
1257       const struct intel_perf_query_field *field = &layout->fields[r];
1258       const uint32_t *value32 = data + field->location;
1259 
1260       switch (field->type) {
1261       case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
1262          fprintf(stderr, "MI_RPC:\n");
1263          fprintf(stderr, "  TS: 0x%08x\n", *(value32 + 1));
1264          fprintf(stderr, "  CLK: 0x%08x\n", *(value32 + 3));
1265          break;
1266       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
1267          fprintf(stderr, "A%u: 0x%08x\n", field->index, *value32);
1268          break;
1269       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
1270          fprintf(stderr, "B%u: 0x%08x\n", field->index, *value32);
1271          break;
1272       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
1273          fprintf(stderr, "C%u: 0x%08x\n", field->index, *value32);
1274          break;
1275       default:
1276          break;
1277       }
1278    }
1279 }
1280 
1281 static int
1282 intel_perf_compare_query_names(const void *v1, const void *v2)
1283 {
1284    const struct intel_perf_query_info *q1 = v1;
1285    const struct intel_perf_query_info *q2 = v2;
1286 
1287    return strcmp(q1->name, q2->name);
1288 }
1289 
1290 static inline struct intel_perf_query_field *
1291 add_query_register(struct intel_perf_query_field_layout *layout,
1292                    enum intel_perf_query_field_type type,
1293                    uint16_t offset,
1294                    uint16_t size,
1295                    uint8_t index)
1296 {
1297    /* Align MI_RPC to 64bytes (HW requirement) & 64bit registers to 8bytes
1298     * (shows up nicely in the debugger).
1299     */
1300    if (type == INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC)
1301       layout->size = align(layout->size, 64);
1302    else if (size % 8 == 0)
1303       layout->size = align(layout->size, 8);
1304 
1305    layout->fields[layout->n_fields++] = (struct intel_perf_query_field) {
1306       .mmio_offset = offset,
1307       .location = layout->size,
1308       .type = type,
1309       .index = index,
1310       .size = size,
1311    };
1312    layout->size += size;
1313 
1314    return &layout->fields[layout->n_fields - 1];
1315 }
1316 
1317 static void
1318 intel_perf_init_query_fields(struct intel_perf_config *perf_cfg,
1319                              const struct intel_device_info *devinfo,
1320                              bool use_register_snapshots)
1321 {
1322    struct intel_perf_query_field_layout *layout = &perf_cfg->query_layout;
1323 
1324    layout->n_fields = 0;
1325 
1326    /* MI_RPC requires a 64byte alignment. */
1327    layout->alignment = 64;
1328 
1329    layout->fields = rzalloc_array(perf_cfg, struct intel_perf_query_field, 5 + 16);
1330 
1331    add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC,
1332                       0, 256, 0);
1333 
1334    if (use_register_snapshots) {
1335       if (devinfo->ver <= 11) {
1336          struct intel_perf_query_field *field =
1337             add_query_register(layout,
1338                                INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
1339                                PERF_CNT_1_DW0, 8, 0);
1340          field->mask = PERF_CNT_VALUE_MASK;
1341 
1342          field = add_query_register(layout,
1343                                     INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
1344                                     PERF_CNT_2_DW0, 8, 1);
1345          field->mask = PERF_CNT_VALUE_MASK;
1346       }
1347 
1348       if (devinfo->ver == 8 && devinfo->platform != INTEL_PLATFORM_CHV) {
1349          add_query_register(layout,
1350                          INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,
1351                             GFX7_RPSTAT1, 4, 0);
1352       }
1353 
1354       if (devinfo->ver >= 9) {
1355          add_query_register(layout,
1356                             INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,
1357                             GFX9_RPSTAT0, 4, 0);
1358       }
1359 
1360       if (!can_use_mi_rpc_bc_counters(devinfo)) {
1361          if (devinfo->ver >= 8 && devinfo->ver <= 11) {
1362             for (uint32_t i = 0; i < GFX8_N_OA_PERF_B32; i++) {
1363                add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
1364                                   GFX8_OA_PERF_B32(i), 4, i);
1365             }
1366             for (uint32_t i = 0; i < GFX8_N_OA_PERF_C32; i++) {
1367                add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
1368                                   GFX8_OA_PERF_C32(i), 4, i);
1369             }
1370          } else if (devinfo->verx10 == 120) {
1371             for (uint32_t i = 0; i < GFX12_N_OAG_PERF_B32; i++) {
1372                add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
1373                                   GFX12_OAG_PERF_B32(i), 4, i);
1374             }
1375             for (uint32_t i = 0; i < GFX12_N_OAG_PERF_C32; i++) {
1376                add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
1377                                   GFX12_OAG_PERF_C32(i), 4, i);
1378             }
1379          } else if (devinfo->verx10 == 125) {
1380             add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A,
1381                                GFX125_OAG_PERF_A36, 4, 36);
1382             add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A,
1383                                GFX125_OAG_PERF_A37, 4, 37);
1384             for (uint32_t i = 0; i < GFX12_N_OAG_PERF_B32; i++) {
1385                add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
1386                                   GFX12_OAG_PERF_B32(i), 4, i);
1387             }
1388             for (uint32_t i = 0; i < GFX12_N_OAG_PERF_C32; i++) {
1389                add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
1390                                   GFX12_OAG_PERF_C32(i), 4, i);
1391             }
1392          }
1393       }
1394    }
1395 
1396    /* Align the whole package to 64bytes so that 2 snapshots can be put
1397     * together without extra alignment for the user.
1398     */
1399    layout->size = align(layout->size, 64);
1400 }
1401 
1402 void
1403 intel_perf_init_metrics(struct intel_perf_config *perf_cfg,
1404                         const struct intel_device_info *devinfo,
1405                         int drm_fd,
1406                         bool include_pipeline_statistics,
1407                         bool use_register_snapshots)
1408 {
1409    intel_perf_init_query_fields(perf_cfg, devinfo, use_register_snapshots);
1410 
1411    if (include_pipeline_statistics) {
1412       load_pipeline_statistic_metrics(perf_cfg, devinfo);
1413       intel_perf_register_mdapi_statistic_query(perf_cfg, devinfo);
1414    }
1415 
1416    bool oa_metrics = oa_metrics_available(perf_cfg, drm_fd, devinfo,
1417                                           use_register_snapshots);
1418    if (oa_metrics)
1419       load_oa_metrics(perf_cfg, drm_fd, devinfo);
1420 
1421    /* sort query groups by name */
1422    qsort(perf_cfg->queries, perf_cfg->n_queries,
1423          sizeof(perf_cfg->queries[0]), intel_perf_compare_query_names);
1424 
1425    build_unique_counter_list(perf_cfg);
1426 
1427    if (oa_metrics)
1428       intel_perf_register_mdapi_oa_query(perf_cfg, devinfo);
1429 }
1430