/*
 * Copyright © 2021 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#include "fd_pps_driver.h"

#include <cstring>
#include <iostream>
#include <perfetto.h>

#include "common/freedreno_dev_info.h"
#include "drm/freedreno_drmif.h"
#include "drm/freedreno_ringbuffer.h"
#include "perfcntrs/freedreno_dt.h"
#include "perfcntrs/freedreno_perfcntr.h"

#include "pps/pps.h"
#include "pps/pps_algorithm.h"

namespace pps
{

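/* Helper used by the derived-counter lambdas below: divides a by b, but
 * returns 0 instead of dividing by zero (e.g. when a countable did not
 * increment during the sample period).
 */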
double
safe_div(uint64_t a, uint64_t b)
{
   if (b == 0)
      return 0;

   return a / static_cast<double>(b);
}

float
percent(uint64_t a, uint64_t b)
{
   /* Sometimes we get bogus values, but we want the timeline to look
    * nice without values higher than 100%.
    */
   if (b == 0 || a > b)
      return 0;

   return 100.f * (a / static_cast<double>(b));
}

bool
FreedrenoDriver::is_dump_perfcnt_preemptible() const
{
   return false;
}

uint64_t
FreedrenoDriver::get_min_sampling_period_ns()
{
   return 100000;
}

/*
 * TODO: this seems like it would be largely the same for a5xx as well
 * (i.e. same countable names)..
 */
void
FreedrenoDriver::setup_a6xx_counters()
{
   /* TODO is there a reason to want more than one group? */
   CounterGroup group = {};
   group.name = "counters";
   groups.clear();
   counters.clear();
   countables.clear();
   enabled_counters.clear();
   groups.emplace_back(std::move(group));

   /*
    * Create the countables that we'll be using.
    */

   auto PERF_CP_ALWAYS_COUNT = countable("CP", "PERF_CP_ALWAYS_COUNT");
   auto PERF_CP_BUSY_CYCLES  = countable("CP", "PERF_CP_BUSY_CYCLES");
   auto PERF_RB_3D_PIXELS    = countable("RB", "PERF_RB_3D_PIXELS");
   auto PERF_TP_L1_CACHELINE_MISSES = countable("TP", "PERF_TP_L1_CACHELINE_MISSES");
   auto PERF_TP_L1_CACHELINE_REQUESTS = countable("TP", "PERF_TP_L1_CACHELINE_REQUESTS");

   auto PERF_TP_OUTPUT_PIXELS  = countable("TP", "PERF_TP_OUTPUT_PIXELS");
   auto PERF_TP_OUTPUT_PIXELS_ANISO  = countable("TP", "PERF_TP_OUTPUT_PIXELS_ANISO");
   auto PERF_TP_OUTPUT_PIXELS_BILINEAR = countable("TP", "PERF_TP_OUTPUT_PIXELS_BILINEAR");
   auto PERF_TP_OUTPUT_PIXELS_POINT = countable("TP", "PERF_TP_OUTPUT_PIXELS_POINT");
   auto PERF_TP_OUTPUT_PIXELS_ZERO_LOD = countable("TP", "PERF_TP_OUTPUT_PIXELS_ZERO_LOD");

   auto PERF_TSE_INPUT_PRIM  = countable("TSE", "PERF_TSE_INPUT_PRIM");
   auto PERF_TSE_CLIPPED_PRIM  = countable("TSE", "PERF_TSE_CLIPPED_PRIM");
   auto PERF_TSE_TRIVAL_REJ_PRIM  = countable("TSE", "PERF_TSE_TRIVAL_REJ_PRIM");
   auto PERF_TSE_OUTPUT_VISIBLE_PRIM = countable("TSE", "PERF_TSE_OUTPUT_VISIBLE_PRIM");

   auto PERF_SP_BUSY_CYCLES  = countable("SP", "PERF_SP_BUSY_CYCLES");
   auto PERF_SP_ALU_WORKING_CYCLES = countable("SP", "PERF_SP_ALU_WORKING_CYCLES");
   auto PERF_SP_EFU_WORKING_CYCLES = countable("SP", "PERF_SP_EFU_WORKING_CYCLES");
   auto PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = countable("SP", "PERF_SP_VS_STAGE_EFU_INSTRUCTIONS");
   auto PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS");
   auto PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = countable("SP", "PERF_SP_VS_STAGE_TEX_INSTRUCTIONS");
   auto PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_EFU_INSTRUCTIONS");
   auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS");
   auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS");
   auto PERF_SP_STALL_CYCLES_TP = countable("SP", "PERF_SP_STALL_CYCLES_TP");
   auto PERF_SP_ANY_EU_WORKING_FS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_FS_STAGE");
   auto PERF_SP_ANY_EU_WORKING_VS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_VS_STAGE");
   auto PERF_SP_ANY_EU_WORKING_CS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_CS_STAGE");

   auto PERF_UCHE_STALL_CYCLES_ARBITER = countable("UCHE", "PERF_UCHE_STALL_CYCLES_ARBITER");
   auto PERF_UCHE_VBIF_READ_BEATS_TP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_TP");
   auto PERF_UCHE_VBIF_READ_BEATS_VFD = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_VFD");
   auto PERF_UCHE_VBIF_READ_BEATS_SP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_SP");
   auto PERF_UCHE_READ_REQUESTS_TP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_TP");

   auto PERF_PC_STALL_CYCLES_VFD = countable("PC", "PERF_PC_STALL_CYCLES_VFD");
   auto PERF_PC_VS_INVOCATIONS = countable("PC", "PERF_PC_VS_INVOCATIONS");
   auto PERF_PC_VERTEX_HITS = countable("PC", "PERF_PC_VERTEX_HITS");

   auto PERF_HLSQ_QUADS = countable("HLSQ", "PERF_HLSQ_QUADS"); /* Quads (fragments / 4) produced */

   auto PERF_CP_NUM_PREEMPTIONS = countable("CP", "PERF_CP_NUM_PREEMPTIONS");
   auto PERF_CP_PREEMPTION_REACTION_DELAY = countable("CP", "PERF_CP_PREEMPTION_REACTION_DELAY");

   /* TODO: resolve() reports that there is no PERF_CMPDECMP_VBIF_READ_DATA */
   // auto PERF_CMPDECMP_VBIF_READ_DATA = countable("PERF_CMPDECMP_VBIF_READ_DATA");

   /*
    * And then set up the derived counters that we are exporting to
    * pps, based on the captured countable values.
    *
    * We try to expose the same counters as the blob:
    * https://gpuinspector.dev/docs/gpu-counters/qualcomm
    */

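   /* Note on how these derived counters are evaluated (inferred from the
    * formulas below): each counter() lambda is re-evaluated once per
    * sample, with a countable such as PERF_CP_ALWAYS_COUNT yielding the
    * value accumulated over the sample period, `time` the length of that
    * period in seconds, and `max_freq` the maximum GPU clock in Hz.
    * Hence PERF_CP_ALWAYS_COUNT / time is cycles per second, i.e. Hertz.
    */
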
   counter("GPU Frequency", Counter::Units::Hertz, [=]() {
         return PERF_CP_ALWAYS_COUNT / time;
      }
   );

   counter("GPU % Utilization", Counter::Units::Percent, [=]() {
         return percent(PERF_CP_BUSY_CYCLES / time, max_freq);
      }
   );

   counter("TP L1 Cache Misses", Counter::Units::None, [=]() {
         return PERF_TP_L1_CACHELINE_MISSES / time;
      }
   );

   counter("Shader Core Utilization", Counter::Units::Percent, [=]() {
         return percent(PERF_SP_BUSY_CYCLES / time, max_freq * info->num_sp_cores);
      }
   );

   /* TODO: verify */
   counter("(?) % Texture Fetch Stall", Counter::Units::Percent, [=]() {
         return percent(PERF_SP_STALL_CYCLES_TP / time, max_freq * info->num_sp_cores);
      }
   );

   /* TODO: verify */
   counter("(?) % Vertex Fetch Stall", Counter::Units::Percent, [=]() {
         return percent(PERF_PC_STALL_CYCLES_VFD / time, max_freq * info->num_sp_cores);
      }
   );

   counter("L1 Texture Cache Miss Per Pixel", Counter::Units::None, [=]() {
         return safe_div(PERF_TP_L1_CACHELINE_MISSES, PERF_HLSQ_QUADS * 4);
      }
   );

   counter("% Texture L1 Miss", Counter::Units::Percent, [=]() {
         return percent(PERF_TP_L1_CACHELINE_MISSES, PERF_TP_L1_CACHELINE_REQUESTS);
      }
   );

   counter("% Texture L2 Miss", Counter::Units::Percent, [=]() {
         return percent(PERF_UCHE_VBIF_READ_BEATS_TP / 2, PERF_UCHE_READ_REQUESTS_TP);
      }
   );

   /* TODO: verify */
   counter("(?) % Stalled on System Memory", Counter::Units::Percent, [=]() {
         return percent(PERF_UCHE_STALL_CYCLES_ARBITER / time, max_freq * info->num_sp_cores);
      }
   );

   counter("Pre-clipped Polygons / Second", Counter::Units::None, [=]() {
         return PERF_TSE_INPUT_PRIM * (1.f / time);
      }
   );

   counter("% Prims Trivially Rejected", Counter::Units::Percent, [=]() {
         return percent(PERF_TSE_TRIVAL_REJ_PRIM, PERF_TSE_INPUT_PRIM);
      }
   );

   counter("% Prims Clipped", Counter::Units::Percent, [=]() {
         return percent(PERF_TSE_CLIPPED_PRIM, PERF_TSE_INPUT_PRIM);
      }
   );

   counter("Average Vertices / Polygon", Counter::Units::None, [=]() {
         return PERF_PC_VS_INVOCATIONS / PERF_TSE_INPUT_PRIM;
      }
   );

   counter("Reused Vertices / Second", Counter::Units::None, [=]() {
         return PERF_PC_VERTEX_HITS * (1.f / time);
      }
   );

   counter("Average Polygon Area", Counter::Units::None, [=]() {
         return safe_div(PERF_HLSQ_QUADS * 4, PERF_TSE_OUTPUT_VISIBLE_PRIM);
      }
   );

   /* TODO: find formula */
   // counter("% Shaders Busy", Counter::Units::Percent, [=]() {
   //       return 100.0 * 0;
   //    }
   // );

   counter("Vertices Shaded / Second", Counter::Units::None, [=]() {
         return PERF_PC_VS_INVOCATIONS * (1.f / time);
      }
   );

   counter("Fragments Shaded / Second", Counter::Units::None, [=]() {
         return PERF_HLSQ_QUADS * 4 * (1.f / time);
      }
   );

   counter("Vertex Instructions / Second", Counter::Units::None, [=]() {
         return (PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS +
                 PERF_SP_VS_STAGE_EFU_INSTRUCTIONS) * (1.f / time);
      }
   );

   counter("Fragment Instructions / Second", Counter::Units::None, [=]() {
         return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
                 PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2 +
                 PERF_SP_FS_STAGE_EFU_INSTRUCTIONS) * (1.f / time);
      }
   );

   counter("Fragment ALU Instructions / Sec (Full)", Counter::Units::None, [=]() {
         return PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * (1.f / time);
      }
   );

   counter("Fragment ALU Instructions / Sec (Half)", Counter::Units::None, [=]() {
         return PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * (1.f / time);
      }
   );

   counter("Fragment EFU Instructions / Second", Counter::Units::None, [=]() {
         return PERF_SP_FS_STAGE_EFU_INSTRUCTIONS * (1.f / time);
      }
   );

   counter("Textures / Vertex", Counter::Units::None, [=]() {
         return safe_div(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
      }
   );

   counter("Textures / Fragment", Counter::Units::None, [=]() {
         return safe_div(PERF_TP_OUTPUT_PIXELS, PERF_HLSQ_QUADS * 4);
      }
   );

   counter("ALU / Vertex", Counter::Units::None, [=]() {
         return safe_div(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
      }
   );

   counter("EFU / Vertex", Counter::Units::None, [=]() {
         return safe_div(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
      }
   );

   counter("ALU / Fragment", Counter::Units::None, [=]() {
         return safe_div(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
                         PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2, PERF_HLSQ_QUADS);
      }
   );

   counter("EFU / Fragment", Counter::Units::None, [=]() {
         return safe_div(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, PERF_HLSQ_QUADS);
      }
   );

   counter("% Time Shading Vertices", Counter::Units::Percent, [=]() {
         return percent(PERF_SP_ANY_EU_WORKING_VS_STAGE,
                        (PERF_SP_ANY_EU_WORKING_VS_STAGE +
                         PERF_SP_ANY_EU_WORKING_FS_STAGE +
                         PERF_SP_ANY_EU_WORKING_CS_STAGE));
      }
   );

   counter("% Time Shading Fragments", Counter::Units::Percent, [=]() {
         return percent(PERF_SP_ANY_EU_WORKING_FS_STAGE,
                        (PERF_SP_ANY_EU_WORKING_VS_STAGE +
                         PERF_SP_ANY_EU_WORKING_FS_STAGE +
                         PERF_SP_ANY_EU_WORKING_CS_STAGE));
      }
   );

   counter("% Time Compute", Counter::Units::Percent, [=]() {
         return percent(PERF_SP_ANY_EU_WORKING_CS_STAGE,
                        (PERF_SP_ANY_EU_WORKING_VS_STAGE +
                         PERF_SP_ANY_EU_WORKING_FS_STAGE +
                         PERF_SP_ANY_EU_WORKING_CS_STAGE));
      }
   );

   counter("% Shader ALU Capacity Utilized", Counter::Units::Percent, [=]() {
         return percent((PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS +
                         PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
                         PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) / 64,
                        PERF_SP_BUSY_CYCLES);
      }
   );

   counter("% Time ALUs Working", Counter::Units::Percent, [=]() {
         return percent(PERF_SP_ALU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
      }
   );

   counter("% Time EFUs Working", Counter::Units::Percent, [=]() {
         return percent(PERF_SP_EFU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
      }
   );

   counter("% Anisotropic Filtered", Counter::Units::Percent, [=]() {
         return percent(PERF_TP_OUTPUT_PIXELS_ANISO, PERF_TP_OUTPUT_PIXELS);
      }
   );

   counter("% Linear Filtered", Counter::Units::Percent, [=]() {
         return percent(PERF_TP_OUTPUT_PIXELS_BILINEAR, PERF_TP_OUTPUT_PIXELS);
      }
   );

   counter("% Nearest Filtered", Counter::Units::Percent, [=]() {
         return percent(PERF_TP_OUTPUT_PIXELS_POINT, PERF_TP_OUTPUT_PIXELS);
      }
   );

   counter("% Non-Base Level Textures", Counter::Units::Percent, [=]() {
         return percent(PERF_TP_OUTPUT_PIXELS_ZERO_LOD, PERF_TP_OUTPUT_PIXELS);
      }
   );

   /* Reads from KGSL_PERFCOUNTER_GROUP_VBIF countable=63 */
   // counter("Read Total (Bytes/sec)", Counter::Units::Byte, [=]() {
   //       return  * (1.f / time);
   //    }
   // );

   /* Reads from KGSL_PERFCOUNTER_GROUP_VBIF countable=84 */
   // counter("Write Total (Bytes/sec)", Counter::Units::Byte, [=]() {
   //       return  * (1.f / time);
   //    }
   // );

   /* Cannot get PERF_CMPDECMP_VBIF_READ_DATA countable */
   // counter("Texture Memory Read BW (Bytes/Second)", Counter::Units::Byte, [=]() {
   //       return (PERF_CMPDECMP_VBIF_READ_DATA + PERF_UCHE_VBIF_READ_BEATS_TP) * (1.f / time);
   //    }
   // );

   /* TODO: verify */
   counter("(?) Vertex Memory Read (Bytes/Second)", Counter::Units::Byte, [=]() {
         return PERF_UCHE_VBIF_READ_BEATS_VFD * 32 * (1.f / time);
      }
   );

   /* TODO: verify */
   counter("SP Memory Read (Bytes/Second)", Counter::Units::Byte, [=]() {
         return PERF_UCHE_VBIF_READ_BEATS_SP * 32 * (1.f / time);
      }
   );

   counter("Avg Bytes / Fragment", Counter::Units::Byte, [=]() {
         return safe_div(PERF_UCHE_VBIF_READ_BEATS_TP * 32, PERF_HLSQ_QUADS * 4);
      }
   );

   counter("Avg Bytes / Vertex", Counter::Units::Byte, [=]() {
         return safe_div(PERF_UCHE_VBIF_READ_BEATS_VFD * 32, PERF_PC_VS_INVOCATIONS);
      }
   );

   counter("Preemptions / second", Counter::Units::None, [=]() {
         return PERF_CP_NUM_PREEMPTIONS * (1.f / time);
      }
   );

   counter("Avg Preemption Delay", Counter::Units::None, [=]() {
         return PERF_CP_PREEMPTION_REACTION_DELAY * (1.f / time);
      }
   );
}

void
FreedrenoDriver::setup_a7xx_counters()
{
   /* TODO is there a reason to want more than one group? */
   CounterGroup group = {};
   group.name = "counters";
   groups.clear();
   counters.clear();
   countables.clear();
   enabled_counters.clear();
   groups.emplace_back(std::move(group));

   /* So far, all a7xx devices seem to have two uSPTPs in each SP core
    * and 128 ALUs in each uSPTP.
    */
   const unsigned number_of_usptp = info->num_sp_cores * 2;
   const unsigned number_of_alus_per_usptp = 128;

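   /* These constants are used below to normalize counters that scale with
    * the number of uSPTPs: e.g. "% Shaders Busy" divides busy cycles by
    * number_of_usptp * PERF_RBBM_STATUS_MASKED, and "% Shader ALU Capacity
    * Utilized" uses number_of_alus_per_usptp as the per-cycle ALU issue
    * width (an interpretation inferred from the formulas below).
    */
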
   /* The enumeration and two helper lambdas serve to handle countables
    * that can be sampled from either rendering or visibility bins.
    */
   enum {
      BR = 0,
      BV = 1,
   };

   auto cbCountable = [=](std::string group, std::string name) {
      return std::array<Countable, 2> {
         countable(group, name),
         countable("BV_" + group, name),
      };
   };

   auto cbSum = [](const std::array<Countable, 2>& countable) {
      return countable[BR] + countable[BV];
   };

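   /* For example, PERF_PC_VS_INVOCATIONS below is such a two-element
    * array: [BR] holds the rendering-bin sample, [BV] the visibility-bin
    * sample, and cbSum(PERF_PC_VS_INVOCATIONS) the total across both bins.
    */
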
   /* This is a helper no-op lambda to handle known and understood counters
    * that we can't currently implement for a variety of reasons.
    */
   auto disabledCounter = [](std::string, Counter::Units, std::function<int64_t()>) { };

   /* CP: 3/14 counters */
   auto PERF_CP_ALWAYS_COUNT = countable("CP", "PERF_CP_ALWAYS_COUNT");
   auto PERF_CP_NUM_PREEMPTIONS = countable("CP", "PERF_CP_NUM_PREEMPTIONS");
   auto PERF_CP_PREEMPTION_REACTION_DELAY = countable("CP", "PERF_CP_PREEMPTION_REACTION_DELAY");

   /* RBBM: 1/4 counters */
   auto PERF_RBBM_STATUS_MASKED = countable("RBBM", "PERF_RBBM_STATUS_MASKED");

   /* PC: 3/8 counters, BV_PC: 3/8 counters */
   auto PERF_PC_STALL_CYCLES_VFD = cbCountable("PC", "PERF_PC_STALL_CYCLES_VFD");
   auto PERF_PC_VERTEX_HITS = cbCountable("PC", "PERF_PC_VERTEX_HITS");
   auto PERF_PC_VS_INVOCATIONS = cbCountable("PC", "PERF_PC_VS_INVOCATIONS");

   /* TSE: 4/8 counters */
   auto PERF_TSE_INPUT_PRIM = countable("TSE", "PERF_TSE_INPUT_PRIM");
   auto PERF_TSE_TRIVAL_REJ_PRIM = countable("TSE", "PERF_TSE_TRIVAL_REJ_PRIM");
   auto PERF_TSE_CLIPPED_PRIM = countable("TSE", "PERF_TSE_CLIPPED_PRIM");
   auto PERF_TSE_OUTPUT_VISIBLE_PRIM = countable("TSE", "PERF_TSE_OUTPUT_VISIBLE_PRIM");

   /* UCHE: 8/12 counters */
   auto PERF_UCHE_STALL_CYCLES_ARBITER = countable("UCHE", "PERF_UCHE_STALL_CYCLES_ARBITER");
   auto PERF_UCHE_VBIF_READ_BEATS_TP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_TP");
   auto PERF_UCHE_VBIF_READ_BEATS_VFD = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_VFD");
   auto PERF_UCHE_VBIF_READ_BEATS_SP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_SP");
   auto PERF_UCHE_READ_REQUESTS_TP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_TP");
   auto PERF_UCHE_READ_REQUESTS_SP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_SP");
   auto PERF_UCHE_WRITE_REQUESTS_SP = countable("UCHE", "PERF_UCHE_WRITE_REQUESTS_SP");
   auto PERF_UCHE_EVICTS = countable("UCHE", "PERF_UCHE_EVICTS");

   /* TP: 7/12 counters, BV_TP: 6/6 counters */
   auto PERF_TP_BUSY_CYCLES = countable("TP", "PERF_TP_BUSY_CYCLES");
   auto PERF_TP_L1_CACHELINE_REQUESTS = cbCountable("TP", "PERF_TP_L1_CACHELINE_REQUESTS");
   auto PERF_TP_L1_CACHELINE_MISSES = cbCountable("TP", "PERF_TP_L1_CACHELINE_MISSES");
   auto PERF_TP_OUTPUT_PIXELS = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS");
   auto PERF_TP_OUTPUT_PIXELS_POINT = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_POINT");
   auto PERF_TP_OUTPUT_PIXELS_BILINEAR = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_BILINEAR");
   auto PERF_TP_OUTPUT_PIXELS_ANISO = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_ANISO");

   /* SP: 24/24 counters, BV_SP: 7/12 counters */
   auto PERF_SP_BUSY_CYCLES = countable("SP", "PERF_SP_BUSY_CYCLES");
   auto PERF_SP_ALU_WORKING_CYCLES = countable("SP", "PERF_SP_ALU_WORKING_CYCLES");
   auto PERF_SP_EFU_WORKING_CYCLES = countable("SP", "PERF_SP_EFU_WORKING_CYCLES");
   auto PERF_SP_STALL_CYCLES_TP = cbCountable("SP", "PERF_SP_STALL_CYCLES_TP");
   auto PERF_SP_NON_EXECUTION_CYCLES = countable("SP", "PERF_SP_NON_EXECUTION_CYCLES");
   auto PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_TEX_INSTRUCTIONS");
   auto PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_EFU_INSTRUCTIONS");
   auto PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS");
   auto PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_EFU_INSTRUCTIONS");
   auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS");
   auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS");
   auto PERF_SP_ICL1_REQUESTS = cbCountable("SP", "PERF_SP_ICL1_REQUESTS");
   auto PERF_SP_ICL1_MISSES = cbCountable("SP", "PERF_SP_ICL1_MISSES");
   auto PERF_SP_ANY_EU_WORKING_FS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_FS_STAGE");
   auto PERF_SP_ANY_EU_WORKING_VS_STAGE = cbCountable("SP", "PERF_SP_ANY_EU_WORKING_VS_STAGE");
   auto PERF_SP_ANY_EU_WORKING_CS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_CS_STAGE");
   auto PERF_SP_PIXELS = countable("SP", "PERF_SP_PIXELS");
   auto PERF_SP_RAY_QUERY_INSTRUCTIONS = countable("SP", "PERF_SP_RAY_QUERY_INSTRUCTIONS");
   auto PERF_SP_RTU_BUSY_CYCLES = countable("SP", "PERF_SP_RTU_BUSY_CYCLES");
   auto PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES = countable("SP", "PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES");
   auto PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES = countable("SP", "PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES");
   auto PERF_SP_RTU_RAY_BOX_INTERSECTIONS = countable("SP", "PERF_SP_RTU_RAY_BOX_INTERSECTIONS");
   auto PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS = countable("SP", "PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS");
   auto PERF_SP_SCH_STALL_CYCLES_RTU = countable("SP", "PERF_SP_SCH_STALL_CYCLES_RTU");

   /* CMP: 1/4 counters */
   auto PERF_CMPDECMP_VBIF_READ_DATA = countable("CMP", "PERF_CMPDECMP_VBIF_READ_DATA");

   /**
    * GPU Compute
    */
517    disabledCounter("Avg Load-Store Instructions Per Cycle", Counter::Units::None, [=]() {
518          /* Number of average Load-Store instructions per cycle. */
519          /* Countables:
520           * PERFCOUNTER_GROUP_SP::COUNTABLE_27 = PERF_SP_LM_LOAD_INSTRUCTIONS
521           * PERFCOUNTER_GROUP_SP::COUNTABLE_28 = PERF_SP_LM_STORE_INSTRUCTIONS
522           * PERFCOUNTER_GROUP_SP::COUNTABLE_30 = PERF_SP_GM_LOAD_INSTRUCTIONS
523           * PERFCOUNTER_GROUP_SP::COUNTABLE_31 = PERF_SP_GM_STORE_INSTRUCTIONS
524           * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
525           * Notes:
526           *   - FIXME: disabled due to lack of SP counter capacity
527           *   - Equation: 4*sum(PERF_SP_{LM,GM}_{LOAD,STORE}_INSTRUCTIONS) / PERF_SP_BUSY_CYCLES
528           */
529          return 42;
530       }
531    );
532    counter("Bytes Data Actually Written", Counter::Units::Byte, [=]() {
533          /* Number of bytes requested to be written by the GPU. */
534          /* Countables:
535           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_18 = PERF_UCHE_EVICTS
536           * Notes:
537           *   - Equation: PERF_UCHE_EVICTS * 64
538           */
539          return PERF_UCHE_EVICTS * 64;
540       }
541    );
542    counter("Bytes Data Write Requested", Counter::Units::Byte, [=]() {
543          /* Number of bytes requested to be written by the GPU. */
544          /* Countables:
545           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_15 = PERF_UCHE_WRITE_REQUESTS_SP
546           * Notes:
547           *   - Equation: PERF_UCHE_WRITE_REQUESTS_SP * 16
548           */
549          return PERF_UCHE_WRITE_REQUESTS_SP * 16;
550       }
551    );
552    counter("Global Buffer Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() {
553          /* Number of bytes of global buffer data read in by the GPU, per second from the system memory (when the data is not found in L2 cache). */
554          /* Countables:
555           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP
556           * Notes:
557           *   - Equation: (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time
558           */
559          return (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time;
560       }
561    );
562    counter("Global Buffer Data Read Request BW (Bytes/sec)", Counter::Units::Byte, [=]() {
563          /* Number of bytes of global buffer read requests, made by a compute kernel to the L2 cache, per second. */
564          /* Countables:
565           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_13 = PERF_UCHE_READ_REQUESTS_SP
566           * Notes:
567           *   - Equation: (PERF_UCHE_READ_REQUESTS_SP * 16) / time
568           */
569          return (PERF_UCHE_READ_REQUESTS_SP * 16) / time;
570       }
571    );
572    counter("% Global Buffer Read L2 Hit", Counter::Units::Percent, [=]() {
573          /* Percentage of total global buffer read requests that were fulfilled by L2 cache hit which is populated by looking at the number of read requests that were forwarded to VBIF to read from the system memory. */
574          /* Countables:
575           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP
576           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_13 = PERF_UCHE_READ_REQUESTS_SP
577           * Notes:
578           *   - Equation: (PERF_UCHE_READ_REQUESTS_SP - (PERF_UCHE_VBIF_READ_BEATS_SP / 2)) / PERF_UCHE_READ_REQUESTS_SP
579           */
580          return percent(PERF_UCHE_READ_REQUESTS_SP - (PERF_UCHE_VBIF_READ_BEATS_SP / 2), PERF_UCHE_READ_REQUESTS_SP);
581       }
582    );
583    counter("% Global Buffer Write L2 Hit", Counter::Units::Percent, [=]() {
584          /* Percentage of global write L2 Hit. */
585          /* Countables:
586           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_18 = PERF_UCHE_EVICTS
587           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_15 = PERF_UCHE_WRITE_REQUESTS_SP
588           * Notes:
589           *   - Equation: (PERF_UCHE_WRITE_REQUESTS_SP - PERF_UCHE_EVICTS) / PERF_UCHE_WRITE_REQUESTS_SP
590           */
591          return percent(PERF_UCHE_WRITE_REQUESTS_SP - PERF_UCHE_EVICTS, PERF_UCHE_WRITE_REQUESTS_SP);
592       }
593    );
594    counter("Global Image Compressed Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() {
595          /* Number of bytes of global Image data (compressed) read in by the GPU per second from the system memory (when the data is not found in L2 cache). */
596          /* Countables:
597           * PERFCOUNTER_GROUP_CMP::COUNTABLE_7 = PERF_CMPDECMP_VBIF_READ_DATA
598           * Notes:
599           *   - Equation: (PERF_CMPDECMP_VBIF_READ_DATA * 32) / time
600           */
601          return (PERF_CMPDECMP_VBIF_READ_DATA * 32) / time;
602       }
603    );
604    counter("Global Image Data Read Request BW (Bytes/sec)", Counter::Units::Byte, [=]() {
605          /* Number of bytes of image buffer read requests, made by a compute kernel to the L2 cache, per second. */
606          /* Countables:
607           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP
608           * Notes:
609           *   - Equation: (PERF_UCHE_READ_REQUESTS_TP * 16) / time
610           */
611          return (PERF_UCHE_READ_REQUESTS_TP * 16) / time;
612       }
613    );
614    counter("Global Image Uncompressed Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() {
615          /* Number of bytes of global Image data (uncompressed) read in by the GPU per second from the system memory (when the data is not found in L2 cache). */
616          /* Countables:
617           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
618           * Notes:
619           *   - Equation: (PERF_UCHE_VBIF_READ_BEATS_TP * 32) / time
620           */
621          return (PERF_UCHE_VBIF_READ_BEATS_TP * 32) / time;
622       }
623    );
624    disabledCounter("Global Memory Atomic Instructions", Counter::Units::None, [=]() {
625          /* Number of Global Memory Atomic Instructions executed by SP during a given sample period. */
626          /* Countables:
627           * PERFCOUNTER_GROUP_SP::COUNTABLE_32 = PERF_SP_GM_ATOMICS
628           * Notes:
629           *   - FIXME: disabled due to lack of SP counter capacity
630           *   - Equation: PERF_SP_GM_ATOMICS * 4
631           */
632          return 42;
633       }
634    );
635    disabledCounter("Global Memory Load Instructions", Counter::Units::None, [=]() {
636          /* Number of Global Memory Load Instructions executed by SP during a given sample period. */
637          /* Countables:
638           * PERFCOUNTER_GROUP_SP::COUNTABLE_30 = PERF_SP_GM_LOAD_INSTRUCTIONS
639           * Notes:
640           *   - FIXME: disabled due to lack of SP counter capacity
641           *   - Equation: PERF_SP_GM_LOAD_INSTRUCTIONS * 4
642           */
643          return 42;
644       }
645    );
646    disabledCounter("Global Memory Store Instructions", Counter::Units::None, [=]() {
647          /* Number of Global Memory Store Instructions executed by SP during a given sample period. */
648          /* Countables:
649           * PERFCOUNTER_GROUP_SP::COUNTABLE_31 = PERF_SP_GM_STORE_INSTRUCTIONS
650           * Notes:
651           *   - FIXME: disabled due to lack of SP counter capacity
652           *   - Equation: PERF_SP_GM_STORE_INSTRUCTIONS * 4
653           */
654          return 42;
655       }
656    );
657    counter("% Image Read L2 Hit", Counter::Units::Percent, [=]() {
658          /* Percentage of total image read requests that were fulfilled by L2 cache hit which is populated by looking at the number of read requests that were forwarded to VBIF to read from the system memory. */
659          /* Countables:
660           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
661           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP
662           * Notes:
663           *   - Equation: (PERF_UCHE_READ_REQUESTS_TP - (PERF_UCHE_VBIF_READ_BEATS_TP / 2)) / PERF_UCHE_READ_REQUESTS_TP
664           */
665          return percent(PERF_UCHE_READ_REQUESTS_TP - (PERF_UCHE_VBIF_READ_BEATS_TP / 2), PERF_UCHE_READ_REQUESTS_TP);
666       }
667    );
668    counter("% Kernel Load Cycles", Counter::Units::Percent, [=]() {
669          /* Percentage of cycles used for a compute kernel loading; excludes execution cycles. */
670          /* Countables:
671           * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT
672           * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
673           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
674           * Notes:
675           *   - Equation: (PERF_RBBM_STATUS_MASKED - (PERF_SP_BUSY_CYCLES * #uSPTP)) / PERF_CP_ALWAYS_COUNT
676           */
677          return percent(PERF_RBBM_STATUS_MASKED - (PERF_SP_BUSY_CYCLES * number_of_usptp), PERF_CP_ALWAYS_COUNT);
678       }
679    );
680    counter("% L1 Hit", Counter::Units::Percent, [=]() {
681          /* Percentage of L1 texture cache requests that were hits. */
682          /* Countables:
683           * PERFCOUNTER_GROUP_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS
684           * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
685           * Notes:
686           *   - Equation: (PERF_TP_L1_CACHELINE_REQUESTS - PERF_TP_L1_CACHELINE_MISSES) / PERF_TP_L1_CACHELINE_REQUESTS
687           */
688          return percent(PERF_TP_L1_CACHELINE_REQUESTS[BR] - PERF_TP_L1_CACHELINE_MISSES[BR], PERF_TP_L1_CACHELINE_REQUESTS[BR]);
689       }
690    );
691    disabledCounter("Load-Store Utilization", Counter::Units::Percent, [=]() {
692          /* Percentage of the Load-Store unit is utilized compared to theoretical Load/Store throughput. */
693          /* Countables:
694           * PERFCOUNTER_GROUP_SP::COUNTABLE_63 = PERF_SP_LOAD_CONTROL_WORKING_CYCLES
695           * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
696           * Notes:
697           *   - FIXME: disabled due to lack of SP counter capacity
698           *   - Equation: PERF_SP_LOAD_CONTROL_WORKING_CYCLES / PERF_SP_BUSY_CYCLES
699           */
700          return 42;
701       }
702    );
703    disabledCounter("Local Memory Atomic Instructions", Counter::Units::None, [=]() {
704          /* Number of Local Memory Atomic Instructions executed by SP during a given sample period. */
705          /* Countables:
706           * PERFCOUNTER_GROUP_SP::COUNTABLE_29 = PERF_SP_LM_ATOMICS
707           * Notes:
708           *   - FIXME: disabled due to lack of SP counter capacity
709           *   - Equation: PERF_SP_LM_ATOMICS * 4
710           */
711          return 42;
712       }
713    );
714    disabledCounter("Local Memory Load Instructions", Counter::Units::None, [=]() {
715          /* Number of Local Memory Load Instructions executed by SP during a given sample period. */
716          /* Countables:
717           * PERFCOUNTER_GROUP_SP::COUNTABLE_27 = PERF_SP_LM_LOAD_INSTRUCTIONS
718           * Notes:
719           *   - FIXME: disabled due to lack of SP counter capacity
720           *   - Equation: PERF_SP_LM_LOAD_INSTRUCTIONS * 4
721           */
722          return 42;
723       }
724    );
725    disabledCounter("Local Memory Store Instructions", Counter::Units::None, [=]() {
726          /* Number of Local Memory Store Instructions executed by SP during a given sample period. */
727          /* Countables:
728           * PERFCOUNTER_GROUP_SP::COUNTABLE_28 = PERF_SP_LM_STORE_INSTRUCTIONS
729           * Notes:
730           *   - FIXME: disabled due to lack of SP counter capacity
731           *   - Equation: PERF_SP_LM_STORE_INSTRUCTIONS * 4
732           */
733          return 42;
734       }
735    );

   /**
    * GPU General
    */
   disabledCounter("Clocks / Second", Counter::Units::None, [=]() {
         /* Number of GPU clocks per second. */
         /* Countables:
          * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT
          * Notes:
          *   - TODO: with Adaptive Clock Distribution, the measured values are much more varied
          *     than the constant GPU frequency value we currently get, so this counter is disabled
          *     for now in favor of the GPU Frequency counter below.
          *   - Equation: PERF_CP_ALWAYS_COUNT / time
          */
         return 42;
      }
   );
   disabledCounter("GPU % Bus Busy", Counter::Units::Percent, [=]() {
         /* Approximate percentage of time the GPU's bus to system memory is busy. */
         /* Countables:
          * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
          * PERFCOUNTER_GROUP_UCHE::COUNTABLE_1 = PERF_UCHE_STALL_CYCLES_ARBITER
          * PERFCOUNTER_GROUP_VBIF::COUNTABLE_34 = PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL
          * PERFCOUNTER_GROUP_VBIF::COUNTABLE_35 = PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL
          * PERFCOUNTER_GROUP_VBIF::COUNTABLE_46 = PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL
          * PERFCOUNTER_GROUP_VBIF::COUNTABLE_47 = PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL
          * Notes:
          *   - TODO: requires VBIF perfcounter group exposure, which isn't trivial because of the
          *     more complex way that those counters are enabled
          *   - Equation: (PERF_UCHE_STALL_CYCLES_ARBITER + sum(PERF_GBIF_AXI{0,1}_{READ,WRITE}_DATA_BEATS_TOTAL)) / (4 * PERF_RBBM_STATUS_MASKED)
          */
         return 42;
      }
   );
770    counter("GPU Frequency", Counter::Units::None, [=]() {
771          /* Notes:
772           *   - TODO: Should read from (an equivalent of) /sys/class/kgsl/kgsl-3d0/gpuclk
773           *   - Same value can be retrieved through PERF_CP_ALWAYS_COUNT, until ACD enables adaptive
774           *     GPU frequencies that would be covered by the Clocks / Second counter above.
775           */
776          return PERF_CP_ALWAYS_COUNT / time;
777       }
778    );
779    disabledCounter("GPU Temperature", Counter::Units::None, [=]() {
780          /* TODO: Should read from (an equivalent of) /sys/class/kgsl/kgsl-3d0/temp */
781          return 42;
782       }
783    );
784    counter("GPU % Utilization", Counter::Units::Percent, [=]() {
785          /* Percentage utilization of the GPU. */
786          /* Countables:
787           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
788           */
789          return percent(PERF_RBBM_STATUS_MASKED, max_freq);
790       }
791    );
792 
793    /**
794     * GPU Memory Stats
795     */
796    counter("Avg Bytes / Fragment", Counter::Units::Byte, [=]() {
797          /* Average number of bytes transferred from main memory for each fragment. */
798          /* Countables:
799           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
800           * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
801           */
802          return safe_div(PERF_UCHE_VBIF_READ_BEATS_TP * 32, PERF_SP_PIXELS);
803       }
804    );
805    counter("Avg Bytes / Vertex", Counter::Units::Byte, [=]() {
806          /* Average number of bytes transferred from main memory for each vertex. */
807          /* Countables:
808           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_5 = PERF_UCHE_VBIF_READ_BEATS_VFD
809           * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
810           * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
811           */
812          return safe_div(PERF_UCHE_VBIF_READ_BEATS_VFD * 32, cbSum(PERF_PC_VS_INVOCATIONS));
813       }
814    );
815    disabledCounter("Read Total (Bytes/sec)", Counter::Units::Byte, [=]() {
816          /* Total number of bytes read by the GPU from memory, per second. */
817          /* Countables:
818           * PERFCOUNTER_GROUP_VBIF::COUNTABLE_34 = PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL
819           * PERFCOUNTER_GROUP_VBIF::COUNTABLE_35 = PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL
820           * Notes:
821           *   - TODO: requires VBIF perfcounter group exposure which isn't trivial because of
822           *     more complex way that those counters are enabled
823           *   - Equation: (PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL + PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL) * 32 / time
824           */
825          return 42;
826       }
827    );
828    counter("SP Memory Read (Bytes/sec)", Counter::Units::Byte, [=]() {
829          /* Bytes of data read from memory by the Shader Processors, per second. */
830          /* Countables:
831           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP
832           */
833          return (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time;
834       }
835    );
836    counter("Texture Memory Read BW (Bytes/sec)", Counter::Units::Byte, [=]() {
837          /* Bytes of texture data read from memory per second. */
838          /* Countables:
839           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
840           * PERFCOUNTER_GROUP_CMP::COUNTABLE_7 = PERF_CMPDECMP_VBIF_READ_DATA
841           */
842          return ((PERF_UCHE_VBIF_READ_BEATS_TP + PERF_CMPDECMP_VBIF_READ_DATA) * 32) / time;
843       }
844    );
845    counter("Vertex Memory Read (Bytes/sec)", Counter::Units::Byte, [=]() {
846          /* Bytes of vertex data read from memory per second. */
847          /* Countables:
848           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_5 = PERF_UCHE_VBIF_READ_BEATS_VFD
849           */
850          return (PERF_UCHE_VBIF_READ_BEATS_VFD * 32) / time;
851       }
852    );
853    disabledCounter("Write Total (Bytes/sec)", Counter::Units::Byte, [=]() {
854          /* Total number of bytes written by the GPU to memory, per second. */
855          /* Countables:
856           * PERFCOUNTER_GROUP_VBIF::COUNTABLE_46 = PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL
857           * PERFCOUNTER_GROUP_VBIF::COUNTABLE_47 = PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL
858           * Notes:
859           *   - TODO: requires VBIF perfcounter group exposure which isn't trivial because of
860           *     more complex way that those counters are enabled
861           *   - Equation: (PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL + PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL) * 32 / time
862           */
863          return 42;
864       }
865    );

   /**
    * GPU Preemption
    */
   counter("Avg Preemption Delay", Counter::Units::None, [=]() {
         /* Average time (us) from the preemption request to preemption start. */
         /* Countables:
          * PERFCOUNTER_GROUP_CP::COUNTABLE_4 = PERF_CP_PREEMPTION_REACTION_DELAY
          * PERFCOUNTER_GROUP_CP::COUNTABLE_3 = PERF_CP_NUM_PREEMPTIONS
          * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT
          * Note:
          *   - PERF_CP_NUM_PREEMPTIONS has to be divided by 2
          */
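         /* Sketch of the unit conversion done below: clocks_per_us is the
          * GPU clock rate expressed in clocks per microsecond, so dividing
          * the reaction-delay clock count by it gives the total delay in
          * microseconds, which is then averaged over the (halved) number
          * of preemptions.
          */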
         if (!PERF_CP_ALWAYS_COUNT || !PERF_CP_NUM_PREEMPTIONS)
            return 0.0;

         double clocks_per_us = (double)PERF_CP_ALWAYS_COUNT / (time * 1000000);
         double delay_us = PERF_CP_PREEMPTION_REACTION_DELAY / clocks_per_us;
         return delay_us / ((double)PERF_CP_NUM_PREEMPTIONS / 2);
      }
   );
887    counter("Preemptions / second", Counter::Units::None, [=]() {
888          /* The number of GPU preemptions that occurred, per second. */
889          /* Countables:
890           * PERFCOUNTER_GROUP_CP::COUNTABLE_3 = PERF_CP_NUM_PREEMPTIONS
891           * Note:
892           *   - PERF_CP_NUM_PREEMPTIONS has to be divided by 2
893           */
894          return PERF_CP_NUM_PREEMPTIONS / (2 * time);
895       }
896    );
897 
898    /**
899     * GPU Primitive Processing
900     */
901    counter("Average Polygon Area", Counter::Units::None, [=]() {
902          /* Average number of pixels per polygon. */
903          /* Countables:
904           * PERFCOUNTER_GROUP_TSE::COUNTABLE_14 = PERF_TSE_OUTPUT_VISIBLE_PRIM
905           * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
906           */
907          return safe_div(PERF_SP_PIXELS, PERF_TSE_OUTPUT_VISIBLE_PRIM);
908       }
909    );
910    counter("Average Vertices / Polygon", Counter::Units::None, [=]() {
911          /* Average number of vertices per polygon. */
912          /* Countables:
913           * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
914           * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
915           * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM
916           */
917          return safe_div(cbSum(PERF_PC_VS_INVOCATIONS), PERF_TSE_INPUT_PRIM);
918       }
919    );
920    counter("Pre-clipped Polygons / Second", Counter::Units::None, [=]() {
921          /* Number of polygons submitted to the GPU, per second, before any hardware clipping. */
922          /* Countables:
923           * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM
924           */
925          return PERF_TSE_INPUT_PRIM / time;
926       }
927    );
928    counter("% Prims Clipped", Counter::Units::Percent, [=]() {
929          /* Percentage of primitives clipped by the GPU (where new primitives are generated). */
930          /* Countables:
931           * PERFCOUNTER_GROUP_TSE::COUNTABLE_9 = PERF_TSE_CLIPPED_PRIM
932           * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM
933           */
934          return percent(PERF_TSE_CLIPPED_PRIM, PERF_TSE_INPUT_PRIM);
935       }
936    );
937    counter("% Prims Trivially Rejected", Counter::Units::Percent, [=]() {
938          /* Percentage of primitives that are trivially rejected. */
939          /* Countables:
940           * PERFCOUNTER_GROUP_TSE::COUNTABLE_8 = PERF_TSE_TRIVAL_REJ_PRIM
941           * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM
942           */
943          return percent(PERF_TSE_TRIVAL_REJ_PRIM, PERF_TSE_INPUT_PRIM);
944       }
945    );
946    counter("Reused Vertices / Second", Counter::Units::None, [=]() {
947          /* Number of vertices used from the post-transform vertex buffer cache, per second. */
948          /* Countables:
949           * PERFCOUNTER_GROUP_PC::COUNTABLE_19 = PERF_PC_VERTEX_HITS
950           * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_19 = PERF_PC_VERTEX_HITS
951           */
952          return cbSum(PERF_PC_VERTEX_HITS) / time;
953       }
954    );
955 
956    /**
957     * GPU Shader Processing
958     */
959    counter("ALU / Fragment", Counter::Units::None, [=]() {
960          /* Average number of scalar fragment shader ALU instructions issued per shaded fragment, expressed as full precision ALUs (2 mediump = 1 fullp). */
961          /* Countables:
962           * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS
963           * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS
964           * PERFCOUNTER_GROUP_SP::COUNTABLE_99 = PERF_SP_QUADS
965           * Notes:
966           *   - PERF_SP_PIXELS is used instead of PERF_SP_QUADS to avoid SP counter group overcapacity.
967           *   - PERF_SP_PIXELS ~ PERF_SP_QUADS * 4
968           *   - original equation uses unmultiplied QUADS as denominator, we use PIXELS ~ QUADS * 4
969           *     to match other per-fragment counters.
970           */
971          return safe_div(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2,
972             PERF_SP_PIXELS);
973       }
974    );
975    counter("ALU / Vertex", Counter::Units::None, [=]() {
976          /* Average number of vertex scalar shader ALU instructions issued per shaded vertex. */
977          /* Countables:
978           * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
979           * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
980           * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
981           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
982           * Notes:
983           *   - Numerator has to be multiplied by four.
984           *   - For some reason half-precision ALUs are not counted.
985           */
986          return safe_div(4 * cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS), cbSum(PERF_PC_VS_INVOCATIONS));
987       }
988    );
989    counter("% Anisotropic Filtered", Counter::Units::Percent, [=]() {
990          /* Percent of texels filtered using the 'Anisotropic' sampling method. */
991          /* Countables:
992           * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
993           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
994           * PERFCOUNTER_GROUP_TP::COUNTABLE_28 = PERF_TP_OUTPUT_PIXELS_ANISO
995           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_28 = PERF_TP_OUTPUT_PIXELS_ANISO
996           */
997          return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_ANISO), cbSum(PERF_TP_OUTPUT_PIXELS));
998       }
999    );
1000    counter("Average BVH Fetch Latency Cycles", Counter::Units::None, [=]() {
1001          /* The Average BVH Fetch Latency cycles is the latency counted from start of BVH query request till getting BVH Query result back. */
1002          /* Countables:
1003           * PERFCOUNTER_GROUP_SP::COUNTABLE_139 = PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES
1004           * PERFCOUNTER_GROUP_SP::COUNTABLE_140 = PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES
1005           * Notes:
1006           *   - TODO: provisional implementation, wasn't able to verify.
1007           */
1008          return safe_div(PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES, PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES);
1009       }
1010    );
1011    counter("EFU / Fragment", Counter::Units::None, [=]() {
1012          /* Average number of scalar fragment shader EFU instructions issued per shaded fragment. */
1013          /* Countables:
1014           * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS
1015           * PERFCOUNTER_GROUP_SP::COUNTABLE_99 = PERF_SP_QUADS
1016           * Notes:
1017           *   - PERF_SP_PIXELS is used instead of PERF_SP_QUADS to avoid SP counter group overcapacity.
1018           *   - PERF_SP_PIXELS ~ PERF_SP_QUADS * 4
1019           *   - original equation uses unmultiplied QUADS as denominator, we use PIXELS ~ QUADS * 4
1020           *     to match other per-fragment counters.
1021           */
1022          return safe_div(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, PERF_SP_PIXELS);
1023       }
1024    );
1025    counter("EFU / Vertex", Counter::Units::None, [=]() {
1026          /* Average number of scalar vertex shader EFU instructions issued per shaded vertex. */
1027          /* Countables:
1028           * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1029           * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1030           * PERFCOUNTER_GROUP_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
1031           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
1032           * Notes:
1033           *   - Numerator has to be multiplied by four.
1034           */
1035          return safe_div(4 * cbSum(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS), cbSum(PERF_PC_VS_INVOCATIONS));
1036       }
1037    );
1038    counter("Fragment ALU Instructions / Sec (Full)", Counter::Units::None, [=]() {
1039          /* Total number of full precision fragment shader instructions issued, per second. */
1040          /* Countables:
1041           * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS
1042           * Notes:
1043           *   - Numerator has to be multiplied by four.
1044           */
1045          return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * 4) / time;
1046       }
1047    );
1048    counter("Fragment ALU Instructions / Sec (Half)", Counter::Units::None, [=]() {
1049          /* Total number of half precision Scalar fragment shader instructions issued, per second. */
1050          /* Countables:
1051           * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS
1052           * Notes:
1053           *   - Numerator has to be multiplied by four.
1054           */
1055          return (PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * 4) / time;
1056       }
1057    );
1058    counter("Fragment EFU Instructions / Second", Counter::Units::None, [=]() {
1059          /* Total number of Scalar fragment shader Elementary Function Unit (EFU) instructions issued, per second. */
1060          /* Countables:
1061           * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS
1062           * Notes:
1063           *   - Numerator has to be multiplied by four.
1064           */
1065          return (PERF_SP_FS_STAGE_EFU_INSTRUCTIONS * 4) / time;
1066       }
1067    );
1068    counter("Fragment Instructions / Second", Counter::Units::None, [=]() {
1069          /* Total number of fragment shader instructions issued, per second. */
1070          /* Countables:
1071           * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS
1072           * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS
1073           * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS
1074           * Notes:
1075           *   - Numerator has to be multiplied by four.
1076           */
1077          return (4 * (PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
1078             + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2)) / time;
1079       }
1080    );
1081    counter("Fragments Shaded / Second", Counter::Units::None, [=]() {
1082          /* Number of fragments submitted to the shader engine, per second. */
1083          /* Countables:
1084           * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
1085           */
1086          return PERF_SP_PIXELS / time;
1087       }
1088    );
1089    counter("% Linear Filtered", Counter::Units::Percent, [=]() {
1090          /* Percent of texels filtered using the 'Linear' sampling method. */
1091          /* Countables:
1092           * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1093           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1094           * PERFCOUNTER_GROUP_TP::COUNTABLE_26 = PERF_TP_OUTPUT_PIXELS_BILINEAR
1095           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_26 = PERF_TP_OUTPUT_PIXELS_BILINEAR
1096           */
1097          return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_BILINEAR), cbSum(PERF_TP_OUTPUT_PIXELS));
1098       }
1099    );
1100    counter("% Nearest Filtered", Counter::Units::Percent, [=]() {
1101          /* Percent of texels filtered using the 'Nearest' sampling method. */
1102          /* Countables:
1103           * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1104           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1105           * PERFCOUNTER_GROUP_TP::COUNTABLE_25 = PERF_TP_OUTPUT_PIXELS_POINT
1106           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_25 = PERF_TP_OUTPUT_PIXELS_POINT
1107           */
1108          return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_POINT), cbSum(PERF_TP_OUTPUT_PIXELS));
1109       }
1110    );
1111    disabledCounter("% Non-Base Level Textures", Counter::Units::Percent, [=]() {
1112          /* Percent of texels coming from a non-base MIP level. */
1113          /* Countables:
1114           * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1115           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1116           * PERFCOUNTER_GROUP_TP::COUNTABLE_29 = PERF_TP_OUTPUT_PIXELS_ZERO_LOD
1117           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_29 = PERF_TP_OUTPUT_PIXELS_ZERO_LOD
1118           * Notes:
1119           *   - FIXME: disabled due to lack of TP counter capacity
1120           *   - Equation: 100.0 - percent(cbSum(PERF_TP_OUTPUT_PIXELS_ZERO_LOD), cbSum(PERF_TP_OUTPUT_PIXELS));
1121           */
1122          return 42;
1123       }
1124    );
1125    counter("% RTU Busy", Counter::Units::Percent, [=]() {
1126          /* Percentage of time that Ray Tracing Unit in SP is busy compared to whole SP. */
1127          /* Countables:
1128           * PERFCOUNTER_GROUP_SP::COUNTABLE_125 = PERF_SP_RTU_BUSY_CYCLES
1129           * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1130           * Notes:
1131           *   - TODO: provisional implementation, wasn't able to verify.
1132           */
1133          return percent(PERF_SP_RTU_BUSY_CYCLES, PERF_SP_BUSY_CYCLES);
1134       }
1135    );
1136    counter("RTU Ray Box Intersections Per Instruction", Counter::Units::None, [=]() {
1137          /* Number of Ray Box intersections per instruction. */
1138          /* Countables:
1139           * PERFCOUNTER_GROUP_SP::COUNTABLE_148 = PERF_SP_RTU_RAY_BOX_INTERSECTIONS
1140           * PERFCOUNTER_GROUP_SP::COUNTABLE_122 = PERF_SP_RAY_QUERY_INSTRUCTIONS
1141           * Notes:
1142           *   - TODO: provisional implementation, wasn't able to verify.
1143           */
1144          return safe_div(PERF_SP_RTU_RAY_BOX_INTERSECTIONS, PERF_SP_RAY_QUERY_INSTRUCTIONS);
1145       }
1146    );
1147    counter("RTU Ray Triangle Intersections Per Instruction", Counter::Units::None, [=]() {
1148          /* Number of Ray Triangle intersections per instruction. */
1149          /* Countables:
1150           * PERFCOUNTER_GROUP_SP::COUNTABLE_149 = PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS
1151           * PERFCOUNTER_GROUP_SP::COUNTABLE_122 = PERF_SP_RAY_QUERY_INSTRUCTIONS
1152           * Notes:
1153           *   - TODO: provisional implementation, wasn't able to verify.
1154           */
1155          return safe_div(PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS, PERF_SP_RAY_QUERY_INSTRUCTIONS);
1156       }
1157    );
1158    counter("% Shader ALU Capacity Utilized", Counter::Units::Percent, [=]() {
1159          /* Percent of maximum shader capacity (ALU operations) utilized. */
1160          /* Countables:
1161           * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1162           * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
1163           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
1164           * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS
1165           * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS
1166           * Notes:
1167           *   - Numerator has to be multiplied by four.
1168           */
1169          int64_t numerator = cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS) +
1170             PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2;
1171          int64_t denominator = PERF_SP_BUSY_CYCLES * number_of_alus_per_usptp;
1172          return percent(numerator, denominator);
1173       }
1174    );
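   /* Worked example for the ALU-capacity math above, using purely
    * hypothetical sample values: with number_of_alus_per_usptp = 64,
    * PERF_SP_BUSY_CYCLES = 1000 and a combined instruction sum of 4000
    * (half-rate ALU instructions already halved), the counter would report
    * percent(4 * 4000, 1000 * 64) = 25.
    */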
1175    counter("% Shaders Busy", Counter::Units::Percent, [=]() {
1176          /* Percentage of time that all Shader cores are busy. */
1177          /* Countables:
1178           * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1179           * PERFCOUNTER_GROUP_TP::COUNTABLE_0 = PERF_TP_BUSY_CYCLES
1180           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1181           * Notes:
1182           *   - SP_BUSY_CYCLES seems to be used as the numerator -- unless it's zero,
1183           *     at which point TP_BUSY_CYCLES seems to be used instead.
1184           */
1185 
1186          int64_t numerator = PERF_SP_BUSY_CYCLES;
1187          if (!numerator)
1188             numerator = PERF_TP_BUSY_CYCLES;
1189          return percent(numerator, number_of_usptp * PERF_RBBM_STATUS_MASKED);
1190       }
1191    );
1192    counter("% Shaders Stalled", Counter::Units::Percent, [=]() {
1193          /* Percentage of time that all shader cores are idle with at least one active wave. */
1194          /* Countables:
1195           * PERFCOUNTER_GROUP_SP::COUNTABLE_7 = PERF_SP_NON_EXECUTION_CYCLES
1196           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1197           */
1198          return percent(PERF_SP_NON_EXECUTION_CYCLES, number_of_usptp * PERF_RBBM_STATUS_MASKED);
1199       }
1200    );
1201    counter("% Texture Pipes Busy", Counter::Units::Percent, [=]() {
1202          /* Percentage of time that any texture pipe is busy. */
1203          /* Countables:
1204           * PERFCOUNTER_GROUP_TP::COUNTABLE_0 = PERF_TP_BUSY_CYCLES
1205           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1206           */
1207          return percent(PERF_TP_BUSY_CYCLES, number_of_usptp * PERF_RBBM_STATUS_MASKED);
1208       }
1209    );
1210    counter("Textures / Fragment", Counter::Units::None, [=]() {
1211          /* Average number of textures referenced per fragment. */
1212          /* Countables:
1213           * PERFCOUNTER_GROUP_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
1214           * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1215           * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
1216           */
1217          return safe_div(PERF_TP_OUTPUT_PIXELS[BR], PERF_SP_PIXELS);
1218       }
1219    );
1220    counter("Textures / Vertex", Counter::Units::None, [=]() {
1221          /* Average number of textures referenced per vertex. */
1222          /* Countables:
1223           * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1224           * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1225           * PERFCOUNTER_GROUP_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
1226           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
1227           * Notes:
1228           *   - Numerator has to be multiplied by four.
1229           */
1230          return safe_div(4 * cbSum(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS), cbSum(PERF_PC_VS_INVOCATIONS));
1231       }
1232    );
1233    counter("% Time ALUs Working", Counter::Units::Percent, [=]() {
1234          /* Percentage of time the ALUs are working while the Shaders are busy. */
1235          /* Countables:
1236           * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1237           * PERFCOUNTER_GROUP_SP::COUNTABLE_1 = PERF_SP_ALU_WORKING_CYCLES
1238           * Notes:
1239           *   - ALU working cycles have to be halved.
1240           */
1241          return percent(PERF_SP_ALU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
1242       }
1243    );
1244    counter("% Time Compute", Counter::Units::Percent, [=]() {
1245          /* Amount of time spent in compute work compared to the total time spent shading everything. */
1246          /* Countables:
1247           * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
1248           * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1249           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1250           * PERFCOUNTER_GROUP_SP::COUNTABLE_78 = PERF_SP_ANY_EU_WORKING_CS_STAGE
1251           * Notes: CS_STAGE amount is also counted in FS_STAGE, so it shouldn't be summed into the total value.
1252           */
1253          int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE +
1254             cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE);
1255          return percent(PERF_SP_ANY_EU_WORKING_CS_STAGE, total);
1256       }
1257    );
1258    counter("% Time EFUs Working", Counter::Units::Percent, [=]() {
1259          /* Percentage of time the EFUs are working while the Shaders are busy. */
1260          /* Countables:
1261           * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1262           * PERFCOUNTER_GROUP_SP::COUNTABLE_2 = PERF_SP_EFU_WORKING_CYCLES
1263           */
1264          return percent(PERF_SP_EFU_WORKING_CYCLES, PERF_SP_BUSY_CYCLES);
1265       }
1266    );
1267    counter("% Time Shading Fragments", Counter::Units::Percent, [=]() {
1268          /* Amount of time spent shading fragments compared to the total time spent shading everything. */
1269          /* Countables:
1270           * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
1271           * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1272           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1273           * PERFCOUNTER_GROUP_SP::COUNTABLE_78 = PERF_SP_ANY_EU_WORKING_CS_STAGE
1274           * Notes:
1275           *   - CS_STAGE amount is also counted in FS_STAGE, so fragment time has to be retrieved
1276           *     through subtraction and the compute time shouldn't be summed into the total value.
1277           */
1278          int64_t fragments = PERF_SP_ANY_EU_WORKING_FS_STAGE - PERF_SP_ANY_EU_WORKING_CS_STAGE;
1279          int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE +
1280             cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE);
1281          return percent(fragments, total);
1282       }
1283    );
1284    counter("% Time Shading Vertices", Counter::Units::Percent, [=]() {
1285          /* Amount of time spent shading vertices compared to the total time spent shading everything. */
1286          /* Countables:
1287           * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
1288           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
1289           * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1290           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1291           * Notes:
1292           *   - CS_STAGE amount is also counted in FS_STAGE, so it shouldn't be summed into the total value.
1293           */
1294          int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE +
1295             cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE);
1296          return percent(cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE), total);
1297       }
1298    );
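   /* Worked example for the three "% Time ..." counters above, using
    * hypothetical sample values: if FS_STAGE = 800, the summed VS_STAGE = 200
    * and CS_STAGE = 100, then total = 1000 and the counters report
    * compute = percent(100, 1000) = 10, fragments = percent(800 - 100, 1000) = 70
    * and vertices = percent(200, 1000) = 20.
    */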
1299    counter("Vertex Instructions / Second", Counter::Units::None, [=]() {
1300          /* Total number of scalar vertex shader instructions issued, per second. */
1301          /* Countables:
1302           * PERFCOUNTER_GROUP_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
1303           * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
1304           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
1305           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
1306           * Notes:
1307           *   - Numerator has to be multiplied by four.
1308           */
1309          return (4 * (cbSum(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS) + cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS))) / time;
1310       }
1311    );
1312    counter("Vertices Shaded / Second", Counter::Units::None, [=]() {
1313          /* Number of vertices submitted to the shader engine, per second. */
1314          /* Countables:
1315           * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1316           * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1317           */
1318          return cbSum(PERF_PC_VS_INVOCATIONS) / time;
1319       }
1320    );
1321    disabledCounter("% Wave Context Occupancy", Counter::Units::Percent, [=]() {
1322          /* Average percentage of wave context occupancy per cycle. */
1323          /* Countables:
1324           * PERFCOUNTER_GROUP_SP::COUNTABLE_8 = PERF_SP_WAVE_CONTEXTS
1325           * PERFCOUNTER_GROUP_SP::COUNTABLE_9 = PERF_SP_WAVE_CONTEXT_CYCLES
1326           * Notes:
1327           *   - FIXME: disabled due to lack of SP counter capacity
1328           *   - The quotient has to be divided by the number of execution wave slots per SP (16 on a7xx)
1329           *   - Equation: (PERF_SP_WAVE_CONTEXTS / PERF_SP_WAVE_CONTEXT_CYCLES) / number_of_execution_wave_slots_per_sp;
1330           */
1331          return 42;
1332       }
1333    );
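   /* If SP counter capacity allowed enabling the counter above, a direct
    * (untested) translation of the equation from its notes would be roughly:
    *
    *    return percent(PERF_SP_WAVE_CONTEXTS,
    *                   16 * PERF_SP_WAVE_CONTEXT_CYCLES);
    *
    * where 16 is the number of execution wave slots per SP on a7xx, per the
    * note above.
    */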
1334 
1335    /**
1336     * GPU Stalls
1337     */
1338    counter("% BVH Fetch Stall", Counter::Units::Percent, [=]() {
1339          /* Percentage of clock cycles where the RTU could not make any more requests for BVH fetch from scheduler. */
1340          /* Countables:
1341           * PERFCOUNTER_GROUP_SP::COUNTABLE_150 = PERF_SP_SCH_STALL_CYCLES_RTU
1342           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1343           * Notes:
1344           *   - TODO: provisional implementation, wasn't able to verify.
1345           */
1346          return percent(PERF_SP_SCH_STALL_CYCLES_RTU, PERF_RBBM_STATUS_MASKED);
1347       }
1348    );
1349    counter("% Instruction Cache Miss", Counter::Units::Percent, [=]() {
1350          /* Number of L1 instruction cache misses divided by L1 instruction cache requests. */
1351          /* Countables:
1352           * PERFCOUNTER_GROUP_SP::COUNTABLE_51 = PERF_SP_ICL1_REQUESTS
1353           * PERFCOUNTER_GROUP_SP::COUNTABLE_52 = PERF_SP_ICL1_MISSES
1354           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_51 = PERF_SP_ICL1_REQUESTS
1355           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_52 = PERF_SP_ICL1_MISSES
1356           */
1357          return percent(cbSum(PERF_SP_ICL1_MISSES), cbSum(PERF_SP_ICL1_REQUESTS));
1358       }
1359    );
1360    counter("L1 Texture Cache Miss Per Pixel", Counter::Units::None, [=]() {
1361          /* Average number of Texture L1 cache misses per pixel. */
1362          /* Countables:
1363           * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
1364           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
1365           * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
1366           */
1367          return safe_div(cbSum(PERF_TP_L1_CACHELINE_MISSES), PERF_SP_PIXELS);
1368       }
1369    );
1370    counter("% Stalled On System Memory", Counter::Units::Percent, [=]() {
1371          /* Percentage of cycles the L2 cache is stalled waiting for data from system memory. */
1372          /* Countables:
1373           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_1 = PERF_UCHE_STALL_CYCLES_ARBITER
1374           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1375           * Notes:
1376           *   - Denominator has to be multiplied by four, for unknown reasons.
1377           */
1378          return percent(PERF_UCHE_STALL_CYCLES_ARBITER, 4 * PERF_RBBM_STATUS_MASKED);
1379       }
1380    );
1381    counter("% Texture Fetch Stall", Counter::Units::Percent, [=]() {
1382          /* Percentage of clock cycles where the shader processors cannot make any more requests for texture data. */
1383          /* Countables:
1384           * PERFCOUNTER_GROUP_SP::COUNTABLE_4 = PERF_SP_STALL_CYCLES_TP
1385           * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_4 = PERF_SP_STALL_CYCLES_TP
1386           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1387           */
1388          return percent(cbSum(PERF_SP_STALL_CYCLES_TP), number_of_usptp * PERF_RBBM_STATUS_MASKED);
1389       }
1390    );
1391    counter("% Texture L1 Miss", Counter::Units::Percent, [=]() {
1392          /* Number of L1 texture cache misses divided by L1 texture cache requests. */
1393          /* Countables:
1394           * PERFCOUNTER_GROUP_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS
1395           * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
1396           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS
1397           * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
1398           */
1399          return percent(cbSum(PERF_TP_L1_CACHELINE_MISSES), cbSum(PERF_TP_L1_CACHELINE_REQUESTS));
1400       }
1401    );
1402    counter("% Texture L2 Miss", Counter::Units::Percent, [=]() {
1403          /* Number of L2 texture cache misses divided by L2 texture cache requests. */
1404          /* Countables:
1405           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
1406           * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP
1407           * Notes:
1408           *   - Ratio has to be multiplied by two; it is unclear where this constant comes from.
1409           */
1410          return percent(2 * PERF_UCHE_VBIF_READ_BEATS_TP, PERF_UCHE_READ_REQUESTS_TP);
1411       }
1412    );
1413    counter("% Vertex Fetch Stall", Counter::Units::Percent, [=]() {
1414          /* Percentage of clock cycles where the GPU cannot make any more requests for vertex data. */
1415          /* Countables:
1416           * PERFCOUNTER_GROUP_PC::COUNTABLE_2 = PERF_PC_STALL_CYCLES_VFD
1417           * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_2 = PERF_PC_STALL_CYCLES_VFD
1418           * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1419           */
1420          return percent(cbSum(PERF_PC_STALL_CYCLES_VFD), PERF_RBBM_STATUS_MASKED);
1421       }
1422    );
1423 }
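
/*
 * All of the counters above follow the same pattern: declare each countable
 * once, then derive a per-sample value from the countable deltas.  A purely
 * hypothetical additional counter (illustrative only, not wired up) would
 * look like:
 *
 *    auto PERF_CP_BUSY_CYCLES = countable("CP", "PERF_CP_BUSY_CYCLES");
 *
 *    counter("% Command Processor Busy", Counter::Units::Percent, [=]() {
 *          return percent(PERF_CP_BUSY_CYCLES, PERF_RBBM_STATUS_MASKED);
 *       }
 *    );
 */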
1424 
1425 /**
1426  * Generate and submit the cmdstream to configure the counter/countable
1427  * muxing.
1428  */
1429 void
1430 FreedrenoDriver::configure_counters(bool reset, bool wait)
1431 {
1432    struct fd_submit *submit = fd_submit_new(pipe);
1433    enum fd_ringbuffer_flags flags =
1434       (enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
1435    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0x1000, flags);
1436 
1437    for (const auto &countable : countables)
1438       countable.configure(ring, reset);
1439 
1440    struct fd_fence *fence = fd_submit_flush(submit, -1, false);
1441 
1442    fd_fence_flush(fence);
1443
1444    if (wait)
1445       fd_pipe_wait(pipe, fence);
1446
1447    fd_fence_del(fence);   /* the fence must outlive the optional wait above */
1448    fd_ringbuffer_del(ring);
1449    fd_submit_del(submit);
1450 }
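
/*
 * Call pattern used elsewhere in this file: configure_counters(true, true)
 * resets the counters and waits for the GPU to apply the muxing (initial
 * setup and resume-after-suspend), while configure_counters(false, false)
 * just re-sends the configuration without resetting or waiting (the
 * keep-alive path on kernels without FD_SUSPEND_COUNT), e.g.:
 *
 *    configure_counters(true, true);   // initial setup / after suspend
 *    collect_countables();             // establish a baseline sample
 */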
1451 
1452 /**
1453  * Read the current counter values and record the time.
1454  */
1455 void
1456 FreedrenoDriver::collect_countables()
1457 {
1458    last_dump_ts = perfetto::base::GetBootTimeNs().count();
1459 
1460    for (const auto &countable : countables)
1461       countable.collect();
1462 }
1463 
1464 bool
1465 FreedrenoDriver::init_perfcnt()
1466 {
1467    uint64_t val;
1468 
1469    if (dev)
1470       return true;
1471 
1472    dev = fd_device_new(drm_device.fd);
1473    pipe = fd_pipe_new2(dev, FD_PIPE_3D, 0);
1474    dev_id = fd_pipe_dev_id(pipe);
1475 
1476    if (fd_pipe_get_param(pipe, FD_MAX_FREQ, &val)) {
1477       PERFETTO_FATAL("Could not get MAX_FREQ");
1478       return false;
1479    }
1480    max_freq = val;
1481 
1482    if (fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val)) {
1483       PERFETTO_ILOG("Could not get SUSPEND_COUNT");
1484    } else {
1485       suspend_count = val;
1486       has_suspend_count = true;
1487    }
1488 
1489    fd_pipe_set_param(pipe, FD_SYSPROF, 1);
1490 
1491    perfcntrs = fd_perfcntrs(fd_pipe_dev_id(pipe), &num_perfcntrs);
1492    if (num_perfcntrs == 0) {
1493       PERFETTO_FATAL("No hw counters available");
1494       return false;
1495    }
1496 
1497    assigned_counters.resize(num_perfcntrs);
1498    assigned_counters.assign(assigned_counters.size(), 0);
1499 
1500    info = fd_dev_info_raw(dev_id);
1501 
1502    switch (fd_dev_gen(dev_id)) {
1503    case 6:
1504       setup_a6xx_counters();
1505       break;
1506    case 7:
1507       setup_a7xx_counters();
1508       break;
1509    default:
1510       PERFETTO_FATAL("Unsupported GPU: a%03u", fd_dev_gpu_id(dev_id));
1511       return false;
1512    }
1513 
1514    state.resize(next_countable_id);
1515 
1516    for (const auto &countable : countables)
1517       countable.resolve();
1518 
1519    io = fd_dt_find_io();
1520    if (!io) {
1521       PERFETTO_FATAL("Could not map GPU I/O space");
1522       return false;
1523    }
1524 
1525    configure_counters(true, true);
1526    collect_countables();
1527 
1528    return true;
1529 }
1530 
1531 void
1532 FreedrenoDriver::enable_counter(const uint32_t counter_id)
1533 {
1534    enabled_counters.push_back(counters[counter_id]);
1535 }
1536 
1537 void
1538 FreedrenoDriver::enable_all_counters()
1539 {
1540    enabled_counters.reserve(counters.size());
1541    for (auto &counter : counters) {
1542       enabled_counters.push_back(counter);
1543    }
1544 }
1545 
1546 void
1547 FreedrenoDriver::enable_perfcnt(const uint64_t /* sampling_period_ns */)
1548 {
1549 }
1550 
1551 bool
1552 FreedrenoDriver::dump_perfcnt()
1553 {
1554    if (has_suspend_count) {
1555       uint64_t val;
1556 
1557       fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val);
1558 
1559       if (suspend_count != val) {
1560          PERFETTO_ILOG("Device had suspended!");
1561 
1562          suspend_count = val;
1563 
1564          configure_counters(true, true);
1565          collect_countables();
1566 
1567          /* We aren't going to have anything sensible by comparing
1568           * current values to values from prior to the suspend, so
1569           * just skip this sampling period.
1570           */
1571          return false;
1572       }
1573    }
1574 
1575    auto last_ts = last_dump_ts;
1576 
1577    /* Capture the timestamp from the *start* of the sampling period: */
1578    last_capture_ts = last_dump_ts;
1579 
1580    collect_countables();
1581 
1582    auto elapsed_time_ns = last_dump_ts - last_ts;
1583 
1584    time = (float)elapsed_time_ns / 1000000000.0;
1585 
1586    /* On older kernels that don't support querying the suspend-
1587     * count, just send the configuration cmdstream regularly to keep
1588     * the GPU alive and correctly configured for the countables
1589     * we want.
1590     */
1591    if (!has_suspend_count) {
1592       configure_counters(false, false);
1593    }
1594 
1595    return true;
1596 }
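
/*
 * The `time` value computed above is the elapsed sampling period in seconds;
 * it is what turns raw countable deltas into rates in counters such as
 * "Vertices Shaded / Second".  As a purely illustrative example, a delta of
 * 50000 invocations over a 100 ms period yields 50000 / 0.1 = 500000
 * vertices per second.
 */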
1597 
1598 uint64_t FreedrenoDriver::next()
1599 {
1600    auto ret = last_capture_ts;
1601    last_capture_ts = 0;
1602    return ret;
1603 }
1604 
1605 void FreedrenoDriver::disable_perfcnt()
1606 {
1607    /* There isn't really any disable, only reconfiguring which countables
1608     * get muxed to which counters
1609     */
1610 }
1611 
1612 /*
1613  * Countable
1614  */
1615 
1616 FreedrenoDriver::Countable
1617 FreedrenoDriver::countable(std::string group, std::string name)
1618 {
1619    auto countable = Countable(this, group, name);
1620    countables.emplace_back(countable);
1621    return countable;
1622 }
1623 
1624 FreedrenoDriver::Countable::Countable(FreedrenoDriver *d, std::string group, std::string name)
1625    : id {d->next_countable_id++}, d {d}, group {group}, name {name}
1626 {
1627 }
1628 
1629 /* Emit register writes on ring to configure counter/countable muxing: */
1630 void
1631 FreedrenoDriver::Countable::configure(struct fd_ringbuffer *ring, bool reset) const
1632 {
1633    const struct fd_perfcntr_countable *countable = d->state[id].countable;
1634    const struct fd_perfcntr_counter   *counter   = d->state[id].counter;
1635 
1636    OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
1637 
1638    if (counter->enable && reset) {
1639       OUT_PKT4(ring, counter->enable, 1);
1640       OUT_RING(ring, 0);
1641    }
1642 
1643    if (counter->clear && reset) {
1644       OUT_PKT4(ring, counter->clear, 1);
1645       OUT_RING(ring, 1);
1646 
1647       OUT_PKT4(ring, counter->clear, 1);
1648       OUT_RING(ring, 0);
1649    }
1650 
1651    OUT_PKT4(ring, counter->select_reg, 1);
1652    OUT_RING(ring, countable->selector);
1653 
1654    if (counter->enable && reset) {
1655       OUT_PKT4(ring, counter->enable, 1);
1656       OUT_RING(ring, 1);
1657    }
1658 }
1659 
1660 /* Collect current counter value and calculate delta since last sample: */
1661 void
1662 FreedrenoDriver::Countable::collect() const
1663 {
1664    const struct fd_perfcntr_counter *counter = d->state[id].counter;
1665 
1666    d->state[id].last_value = d->state[id].value;
1667 
1668    /* this is true on a5xx and later */
1669    assert(counter->counter_reg_lo + 1 == counter->counter_reg_hi);
1670    uint64_t *reg = (uint64_t *)((uint32_t *)d->io + counter->counter_reg_lo);
1671 
1672    d->state[id].value = *reg;
1673 }
1674 
1675 /* Resolve the countable and assign the next counter from its group: */
1676 void
1677 FreedrenoDriver::Countable::resolve() const
1678 {
1679    for (unsigned i = 0; i < d->num_perfcntrs; i++) {
1680       const struct fd_perfcntr_group *g = &d->perfcntrs[i];
1681       if (group != g->name)
1682          continue;
1683 
1684       for (unsigned j = 0; j < g->num_countables; j++) {
1685          const struct fd_perfcntr_countable *c = &g->countables[j];
1686          if (name != c->name)
1687             continue;
1688 
1689          d->state[id].countable = c;
1690 
1691          /* Assign a counter from the same group: */
1692          assert(d->assigned_counters[i] < g->num_counters);
1693          d->state[id].counter = &g->counters[d->assigned_counters[i]++];
1694 
1695          std::cout << "Countable: " << name << ", group=" << g->name <<
1696                ", counter=" << d->assigned_counters[i] - 1 << "\n";
1697 
1698          return;
1699       }
1700    }
1701    unreachable("no such countable!");
1702 }
1703 
1704 uint64_t
1705 FreedrenoDriver::Countable::get_value() const
1706 {
1707    return d->state[id].value - d->state[id].last_value;
1708 }
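
/*
 * Illustration of the delta semantics above with hypothetical register
 * reads: if collect() sampled 1000000 busy cycles last period and 1250000
 * this period, get_value() reports 250000, i.e. the work done during the
 * most recent sampling window rather than the absolute counter value.
 */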
1709 
1710 /*
1711  * DerivedCounter
1712  */
1713 
1714 FreedrenoDriver::DerivedCounter::DerivedCounter(FreedrenoDriver *d, std::string name,
1715                                                 Counter::Units units,
1716                                                 std::function<int64_t()> derive)
1717    : Counter(d->next_counter_id++, name, 0)
1718 {
1719    std::cout << "DerivedCounter: " << name << ", id=" << id << "\n";
1720    this->units = units;
1721    set_getter([=](const Counter &c, const Driver &d) {
1722          return derive();
1723       }
1724    );
1725 }
1726 
1727 FreedrenoDriver::DerivedCounter
1728 FreedrenoDriver::counter(std::string name, Counter::Units units,
1729                          std::function<int64_t()> derive)
1730 {
1731    auto counter = DerivedCounter(this, name, units, derive);
1732    counters.emplace_back(counter);
1733    return counter;
1734 }
1735 
1736 uint32_t
1737 FreedrenoDriver::gpu_clock_id() const
1738 {
1739    return perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME;
1740 }
1741 
1742 uint64_t
1743 FreedrenoDriver::gpu_timestamp() const
1744 {
1745    return perfetto::base::GetBootTimeNs().count();
1746 }
1747 
1748 bool
1749 FreedrenoDriver::cpu_gpu_timestamp(uint64_t &, uint64_t &) const
1750 {
1751    /* Not supported */
1752    return false;
1753 }
1754 
1755 } // namespace pps
1756