1 /*
2 * Copyright © 2021 Google, Inc.
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "fd_pps_driver.h"
7
8 #include <cstring>
9 #include <iostream>
10 #include <perfetto.h>
11
12 #include "common/freedreno_dev_info.h"
13 #include "drm/freedreno_drmif.h"
14 #include "drm/freedreno_ringbuffer.h"
15 #include "perfcntrs/freedreno_dt.h"
16 #include "perfcntrs/freedreno_perfcntr.h"
17
18 #include "pps/pps.h"
19 #include "pps/pps_algorithm.h"
20
21 namespace pps
22 {
23
/* Divide @a by @b as doubles, yielding 0 instead of a division by zero
 * when the denominator is 0.  Used for ratio-style derived counters
 * whose denominator countable may legitimately be 0 in a sample window.
 */
double
safe_div(uint64_t a, uint64_t b)
{
   return b ? static_cast<double>(a) / static_cast<double>(b) : 0.0;
}
32
/* Express @a as a percentage of @b.  Returns 0 both for a zero
 * denominator and for a > b: sometimes we get bogus values, and we
 * want the timeline to look nice without higher-than-100% spikes.
 */
float
percent(uint64_t a, uint64_t b)
{
   const bool in_range = (b != 0) && (a <= b);
   if (!in_range)
      return 0;

   const double ratio = static_cast<double>(a) / static_cast<double>(b);
   return static_cast<float>(100.0 * ratio);
}
44
45 bool
is_dump_perfcnt_preemptible() const46 FreedrenoDriver::is_dump_perfcnt_preemptible() const
47 {
48 return false;
49 }
50
51 uint64_t
get_min_sampling_period_ns()52 FreedrenoDriver::get_min_sampling_period_ns()
53 {
54 return 100000;
55 }
56
/*
TODO this seems like it would be largely the same for a5xx as well
(ie. same countable names)..
 */
61 void
setup_a6xx_counters()62 FreedrenoDriver::setup_a6xx_counters()
63 {
64 /* TODO is there a reason to want more than one group? */
65 CounterGroup group = {};
66 group.name = "counters";
67 groups.clear();
68 counters.clear();
69 countables.clear();
70 enabled_counters.clear();
71 groups.emplace_back(std::move(group));
72
73 /*
74 * Create the countables that we'll be using.
75 */
76
77 auto PERF_CP_ALWAYS_COUNT = countable("CP", "PERF_CP_ALWAYS_COUNT");
78 auto PERF_CP_BUSY_CYCLES = countable("CP", "PERF_CP_BUSY_CYCLES");
79 auto PERF_RB_3D_PIXELS = countable("RB", "PERF_RB_3D_PIXELS");
80 auto PERF_TP_L1_CACHELINE_MISSES = countable("TP", "PERF_TP_L1_CACHELINE_MISSES");
81 auto PERF_TP_L1_CACHELINE_REQUESTS = countable("TP", "PERF_TP_L1_CACHELINE_REQUESTS");
82
83 auto PERF_TP_OUTPUT_PIXELS = countable("TP", "PERF_TP_OUTPUT_PIXELS");
84 auto PERF_TP_OUTPUT_PIXELS_ANISO = countable("TP", "PERF_TP_OUTPUT_PIXELS_ANISO");
85 auto PERF_TP_OUTPUT_PIXELS_BILINEAR = countable("TP", "PERF_TP_OUTPUT_PIXELS_BILINEAR");
86 auto PERF_TP_OUTPUT_PIXELS_POINT = countable("TP", "PERF_TP_OUTPUT_PIXELS_POINT");
87 auto PERF_TP_OUTPUT_PIXELS_ZERO_LOD = countable("TP", "PERF_TP_OUTPUT_PIXELS_ZERO_LOD");
88
89 auto PERF_TSE_INPUT_PRIM = countable("TSE", "PERF_TSE_INPUT_PRIM");
90 auto PERF_TSE_CLIPPED_PRIM = countable("TSE", "PERF_TSE_CLIPPED_PRIM");
91 auto PERF_TSE_TRIVAL_REJ_PRIM = countable("TSE", "PERF_TSE_TRIVAL_REJ_PRIM");
92 auto PERF_TSE_OUTPUT_VISIBLE_PRIM = countable("TSE", "PERF_TSE_OUTPUT_VISIBLE_PRIM");
93
94 auto PERF_SP_BUSY_CYCLES = countable("SP", "PERF_SP_BUSY_CYCLES");
95 auto PERF_SP_ALU_WORKING_CYCLES = countable("SP", "PERF_SP_ALU_WORKING_CYCLES");
96 auto PERF_SP_EFU_WORKING_CYCLES = countable("SP", "PERF_SP_EFU_WORKING_CYCLES");
97 auto PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = countable("SP", "PERF_SP_VS_STAGE_EFU_INSTRUCTIONS");
98 auto PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS");
99 auto PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = countable("SP", "PERF_SP_VS_STAGE_TEX_INSTRUCTIONS");
100 auto PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_EFU_INSTRUCTIONS");
101 auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS");
102 auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS");
103 auto PERF_SP_STALL_CYCLES_TP = countable("SP", "PERF_SP_STALL_CYCLES_TP");
104 auto PERF_SP_ANY_EU_WORKING_FS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_FS_STAGE");
105 auto PERF_SP_ANY_EU_WORKING_VS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_VS_STAGE");
106 auto PERF_SP_ANY_EU_WORKING_CS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_CS_STAGE");
107
108 auto PERF_UCHE_STALL_CYCLES_ARBITER = countable("UCHE", "PERF_UCHE_STALL_CYCLES_ARBITER");
109 auto PERF_UCHE_VBIF_READ_BEATS_TP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_TP");
110 auto PERF_UCHE_VBIF_READ_BEATS_VFD = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_VFD");
111 auto PERF_UCHE_VBIF_READ_BEATS_SP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_SP");
112 auto PERF_UCHE_READ_REQUESTS_TP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_TP");
113
114 auto PERF_PC_STALL_CYCLES_VFD = countable("PC", "PERF_PC_STALL_CYCLES_VFD");
115 auto PERF_PC_VS_INVOCATIONS = countable("PC", "PERF_PC_VS_INVOCATIONS");
116 auto PERF_PC_VERTEX_HITS = countable("PC", "PERF_PC_VERTEX_HITS");
117
118 auto PERF_HLSQ_QUADS = countable("HLSQ", "PERF_HLSQ_QUADS"); /* Quads (fragments / 4) produced */
119
120 auto PERF_CP_NUM_PREEMPTIONS = countable("CP", "PERF_CP_NUM_PREEMPTIONS");
121 auto PERF_CP_PREEMPTION_REACTION_DELAY = countable("CP", "PERF_CP_PREEMPTION_REACTION_DELAY");
122
123 /* TODO: resolve() tells there is no PERF_CMPDECMP_VBIF_READ_DATA */
124 // auto PERF_CMPDECMP_VBIF_READ_DATA = countable("PERF_CMPDECMP_VBIF_READ_DATA");
125
126 /*
127 * And then setup the derived counters that we are exporting to
128 * pps based on the captured countable values.
129 *
130 * We try to expose the same counters as blob:
131 * https://gpuinspector.dev/docs/gpu-counters/qualcomm
132 */
133
134 counter("GPU Frequency", Counter::Units::Hertz, [=]() {
135 return PERF_CP_ALWAYS_COUNT / time;
136 }
137 );
138
139 counter("GPU % Utilization", Counter::Units::Percent, [=]() {
140 return percent(PERF_CP_BUSY_CYCLES / time, max_freq);
141 }
142 );
143
144 counter("TP L1 Cache Misses", Counter::Units::None, [=]() {
145 return PERF_TP_L1_CACHELINE_MISSES / time;
146 }
147 );
148
149 counter("Shader Core Utilization", Counter::Units::Percent, [=]() {
150 return percent(PERF_SP_BUSY_CYCLES / time, max_freq * info->num_sp_cores);
151 }
152 );
153
154 /* TODO: verify */
155 counter("(?) % Texture Fetch Stall", Counter::Units::Percent, [=]() {
156 return percent(PERF_SP_STALL_CYCLES_TP / time, max_freq * info->num_sp_cores);
157 }
158 );
159
160 /* TODO: verify */
161 counter("(?) % Vertex Fetch Stall", Counter::Units::Percent, [=]() {
162 return percent(PERF_PC_STALL_CYCLES_VFD / time, max_freq * info->num_sp_cores);
163 }
164 );
165
166 counter("L1 Texture Cache Miss Per Pixel", Counter::Units::None, [=]() {
167 return safe_div(PERF_TP_L1_CACHELINE_MISSES, PERF_HLSQ_QUADS * 4);
168 }
169 );
170
171 counter("% Texture L1 Miss", Counter::Units::Percent, [=]() {
172 return percent(PERF_TP_L1_CACHELINE_MISSES, PERF_TP_L1_CACHELINE_REQUESTS);
173 }
174 );
175
176 counter("% Texture L2 Miss", Counter::Units::Percent, [=]() {
177 return percent(PERF_UCHE_VBIF_READ_BEATS_TP / 2, PERF_UCHE_READ_REQUESTS_TP);
178 }
179 );
180
181 /* TODO: verify */
182 counter("(?) % Stalled on System Memory", Counter::Units::Percent, [=]() {
183 return percent(PERF_UCHE_STALL_CYCLES_ARBITER / time, max_freq * info->num_sp_cores);
184 }
185 );
186
187 counter("Pre-clipped Polygons / Second", Counter::Units::None, [=]() {
188 return PERF_TSE_INPUT_PRIM * (1.f / time);
189 }
190 );
191
192 counter("% Prims Trivially Rejected", Counter::Units::Percent, [=]() {
193 return percent(PERF_TSE_TRIVAL_REJ_PRIM, PERF_TSE_INPUT_PRIM);
194 }
195 );
196
197 counter("% Prims Clipped", Counter::Units::Percent, [=]() {
198 return percent(PERF_TSE_CLIPPED_PRIM, PERF_TSE_INPUT_PRIM);
199 }
200 );
201
202 counter("Average Vertices / Polygon", Counter::Units::None, [=]() {
203 return PERF_PC_VS_INVOCATIONS / PERF_TSE_INPUT_PRIM;
204 }
205 );
206
207 counter("Reused Vertices / Second", Counter::Units::None, [=]() {
208 return PERF_PC_VERTEX_HITS * (1.f / time);
209 }
210 );
211
212 counter("Average Polygon Area", Counter::Units::None, [=]() {
213 return safe_div(PERF_HLSQ_QUADS * 4, PERF_TSE_OUTPUT_VISIBLE_PRIM);
214 }
215 );
216
217 /* TODO: find formula */
218 // counter("% Shaders Busy", Counter::Units::Percent, [=]() {
219 // return 100.0 * 0;
220 // }
221 // );
222
223 counter("Vertices Shaded / Second", Counter::Units::None, [=]() {
224 return PERF_PC_VS_INVOCATIONS * (1.f / time);
225 }
226 );
227
228 counter("Fragments Shaded / Second", Counter::Units::None, [=]() {
229 return PERF_HLSQ_QUADS * 4 * (1.f / time);
230 }
231 );
232
233 counter("Vertex Instructions / Second", Counter::Units::None, [=]() {
234 return (PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS +
235 PERF_SP_VS_STAGE_EFU_INSTRUCTIONS) * (1.f / time);
236 }
237 );
238
239 counter("Fragment Instructions / Second", Counter::Units::None, [=]() {
240 return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
241 PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2 +
242 PERF_SP_FS_STAGE_EFU_INSTRUCTIONS) * (1.f / time);
243 }
244 );
245
246 counter("Fragment ALU Instructions / Sec (Full)", Counter::Units::None, [=]() {
247 return PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * (1.f / time);
248 }
249 );
250
251 counter("Fragment ALU Instructions / Sec (Half)", Counter::Units::None, [=]() {
252 return PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * (1.f / time);
253 }
254 );
255
256 counter("Fragment EFU Instructions / Second", Counter::Units::None, [=]() {
257 return PERF_SP_FS_STAGE_EFU_INSTRUCTIONS * (1.f / time);
258 }
259 );
260
261 counter("Textures / Vertex", Counter::Units::None, [=]() {
262 return safe_div(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
263 }
264 );
265
266 counter("Textures / Fragment", Counter::Units::None, [=]() {
267 return safe_div(PERF_TP_OUTPUT_PIXELS, PERF_HLSQ_QUADS * 4);
268 }
269 );
270
271 counter("ALU / Vertex", Counter::Units::None, [=]() {
272 return safe_div(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
273 }
274 );
275
276 counter("EFU / Vertex", Counter::Units::None, [=]() {
277 return safe_div(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
278 }
279 );
280
281 counter("ALU / Fragment", Counter::Units::None, [=]() {
282 return safe_div(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
283 PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2, PERF_HLSQ_QUADS);
284 }
285 );
286
287 counter("EFU / Fragment", Counter::Units::None, [=]() {
288 return safe_div(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, PERF_HLSQ_QUADS);
289 }
290 );
291
292 counter("% Time Shading Vertices", Counter::Units::Percent, [=]() {
293 return percent(PERF_SP_ANY_EU_WORKING_VS_STAGE,
294 (PERF_SP_ANY_EU_WORKING_VS_STAGE +
295 PERF_SP_ANY_EU_WORKING_FS_STAGE +
296 PERF_SP_ANY_EU_WORKING_CS_STAGE));
297 }
298 );
299
300 counter("% Time Shading Fragments", Counter::Units::Percent, [=]() {
301 return percent(PERF_SP_ANY_EU_WORKING_FS_STAGE,
302 (PERF_SP_ANY_EU_WORKING_VS_STAGE +
303 PERF_SP_ANY_EU_WORKING_FS_STAGE +
304 PERF_SP_ANY_EU_WORKING_CS_STAGE));
305 }
306 );
307
308 counter("% Time Compute", Counter::Units::Percent, [=]() {
309 return percent(PERF_SP_ANY_EU_WORKING_CS_STAGE,
310 (PERF_SP_ANY_EU_WORKING_VS_STAGE +
311 PERF_SP_ANY_EU_WORKING_FS_STAGE +
312 PERF_SP_ANY_EU_WORKING_CS_STAGE));
313 }
314 );
315
316 counter("% Shader ALU Capacity Utilized", Counter::Units::Percent, [=]() {
317 return percent((PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS +
318 PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
319 PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) / 64,
320 PERF_SP_BUSY_CYCLES);
321 }
322 );
323
324 counter("% Time ALUs Working", Counter::Units::Percent, [=]() {
325 return percent(PERF_SP_ALU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
326 }
327 );
328
329 counter("% Time EFUs Working", Counter::Units::Percent, [=]() {
330 return percent(PERF_SP_EFU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
331 }
332 );
333
334 counter("% Anisotropic Filtered", Counter::Units::Percent, [=]() {
335 return percent(PERF_TP_OUTPUT_PIXELS_ANISO, PERF_TP_OUTPUT_PIXELS);
336 }
337 );
338
339 counter("% Linear Filtered", Counter::Units::Percent, [=]() {
340 return percent(PERF_TP_OUTPUT_PIXELS_BILINEAR, PERF_TP_OUTPUT_PIXELS);
341 }
342 );
343
344 counter("% Nearest Filtered", Counter::Units::Percent, [=]() {
345 return percent(PERF_TP_OUTPUT_PIXELS_POINT, PERF_TP_OUTPUT_PIXELS);
346 }
347 );
348
349 counter("% Non-Base Level Textures", Counter::Units::Percent, [=]() {
350 return percent(PERF_TP_OUTPUT_PIXELS_ZERO_LOD, PERF_TP_OUTPUT_PIXELS);
351 }
352 );
353
354 /* Reads from KGSL_PERFCOUNTER_GROUP_VBIF countable=63 */
355 // counter("Read Total (Bytes/sec)", Counter::Units::Byte, [=]() {
356 // return * (1.f / time);
357 // }
358 // );
359
360 /* Reads from KGSL_PERFCOUNTER_GROUP_VBIF countable=84 */
361 // counter("Write Total (Bytes/sec)", Counter::Units::Byte, [=]() {
362 // return * (1.f / time);
363 // }
364 // );
365
366 /* Cannot get PERF_CMPDECMP_VBIF_READ_DATA countable */
367 // counter("Texture Memory Read BW (Bytes/Second)", Counter::Units::Byte, [=]() {
368 // return (PERF_CMPDECMP_VBIF_READ_DATA + PERF_UCHE_VBIF_READ_BEATS_TP) * (1.f / time);
369 // }
370 // );
371
372 /* TODO: verify */
373 counter("(?) Vertex Memory Read (Bytes/Second)", Counter::Units::Byte, [=]() {
374 return PERF_UCHE_VBIF_READ_BEATS_VFD * 32 * (1.f / time);
375 }
376 );
377
378 /* TODO: verify */
379 counter("SP Memory Read (Bytes/Second)", Counter::Units::Byte, [=]() {
380 return PERF_UCHE_VBIF_READ_BEATS_SP * 32 * (1.f / time);
381 }
382 );
383
384 counter("Avg Bytes / Fragment", Counter::Units::Byte, [=]() {
385 return safe_div(PERF_UCHE_VBIF_READ_BEATS_TP * 32, PERF_HLSQ_QUADS * 4);
386 }
387 );
388
389 counter("Avg Bytes / Vertex", Counter::Units::Byte, [=]() {
390 return safe_div(PERF_UCHE_VBIF_READ_BEATS_VFD * 32, PERF_PC_VS_INVOCATIONS);
391 }
392 );
393
394 counter("Preemptions / second", Counter::Units::None, [=]() {
395 return PERF_CP_NUM_PREEMPTIONS * (1.f / time);
396 }
397 );
398
399 counter("Avg Preemption Delay", Counter::Units::None, [=]() {
400 return PERF_CP_PREEMPTION_REACTION_DELAY * (1.f / time);
401 }
402 );
403 }
404
405 void
setup_a7xx_counters()406 FreedrenoDriver::setup_a7xx_counters()
407 {
408 /* TODO is there a reason to want more than one group? */
409 CounterGroup group = {};
410 group.name = "counters";
411 groups.clear();
412 counters.clear();
413 countables.clear();
414 enabled_counters.clear();
415 groups.emplace_back(std::move(group));
416
417 /* So far, all a7xx devices seem to have two uSPTPs in each SP core
418 * and 128 ALUs in each uSPTP.
419 */
420 const unsigned number_of_usptp = info->num_sp_cores * 2;
421 const unsigned number_of_alus_per_usptp = 128;
422
423 /* The enumeration and two helper lambdas serve to handle countables
424 * that can be sampled from either rendering or visibility bins.
425 */
426 enum {
427 BR = 0,
428 BV = 1,
429 };
430
431 auto cbCountable = [=](std::string group, std::string name) {
432 return std::array<Countable, 2> {
433 countable(group, name),
434 countable("BV_" + group, name),
435 };
436 };
437
438 auto cbSum = [](const std::array<Countable, 2>& countable) {
439 return countable[BR] + countable[BV];
440 };
441
442 /* This is a helper no-op lambda to handle known and understood counters
443 * that we can't currently implement for a variety of reasons.
444 */
445 auto disabledCounter = [](std::string, Counter::Units, std::function<int64_t()>) { };
446
447 /* CP: 3/14 counters */
448 auto PERF_CP_ALWAYS_COUNT = countable("CP", "PERF_CP_ALWAYS_COUNT");
449 auto PERF_CP_NUM_PREEMPTIONS = countable("CP", "PERF_CP_NUM_PREEMPTIONS");
450 auto PERF_CP_PREEMPTION_REACTION_DELAY = countable("CP", "PERF_CP_PREEMPTION_REACTION_DELAY");
451
452 /* RBBM: 1/4 counters */
453 auto PERF_RBBM_STATUS_MASKED = countable("RBBM", "PERF_RBBM_STATUS_MASKED");
454
455 /* PC: 3/8 counters, BV_PC: 3/8 counters */
456 auto PERF_PC_STALL_CYCLES_VFD = cbCountable("PC", "PERF_PC_STALL_CYCLES_VFD");
457 auto PERF_PC_VERTEX_HITS = cbCountable("PC", "PERF_PC_VERTEX_HITS");
458 auto PERF_PC_VS_INVOCATIONS = cbCountable("PC", "PERF_PC_VS_INVOCATIONS");
459
460 /* TSE: 4/8 counters */
461 auto PERF_TSE_INPUT_PRIM = countable("TSE", "PERF_TSE_INPUT_PRIM");
462 auto PERF_TSE_TRIVAL_REJ_PRIM = countable("TSE", "PERF_TSE_TRIVAL_REJ_PRIM");
463 auto PERF_TSE_CLIPPED_PRIM = countable("TSE", "PERF_TSE_CLIPPED_PRIM");
464 auto PERF_TSE_OUTPUT_VISIBLE_PRIM = countable("TSE", "PERF_TSE_OUTPUT_VISIBLE_PRIM");
465
466 /* UCHE: 8/12 counters */
467 auto PERF_UCHE_STALL_CYCLES_ARBITER = countable("UCHE", "PERF_UCHE_STALL_CYCLES_ARBITER");
468 auto PERF_UCHE_VBIF_READ_BEATS_TP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_TP");
469 auto PERF_UCHE_VBIF_READ_BEATS_VFD = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_VFD");
470 auto PERF_UCHE_VBIF_READ_BEATS_SP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_SP");
471 auto PERF_UCHE_READ_REQUESTS_TP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_TP");
472 auto PERF_UCHE_READ_REQUESTS_SP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_SP");
473 auto PERF_UCHE_WRITE_REQUESTS_SP = countable("UCHE", "PERF_UCHE_WRITE_REQUESTS_SP");
474 auto PERF_UCHE_EVICTS = countable("UCHE", "PERF_UCHE_EVICTS");
475
476 /* TP: 7/12 counters, BV_TP: 6/6 counters */
477 auto PERF_TP_BUSY_CYCLES = countable("TP", "PERF_TP_BUSY_CYCLES");
478 auto PERF_TP_L1_CACHELINE_REQUESTS = cbCountable("TP", "PERF_TP_L1_CACHELINE_REQUESTS");
479 auto PERF_TP_L1_CACHELINE_MISSES = cbCountable("TP", "PERF_TP_L1_CACHELINE_MISSES");
480 auto PERF_TP_OUTPUT_PIXELS = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS");
481 auto PERF_TP_OUTPUT_PIXELS_POINT = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_POINT");
482 auto PERF_TP_OUTPUT_PIXELS_BILINEAR = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_BILINEAR");
483 auto PERF_TP_OUTPUT_PIXELS_ANISO = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_ANISO");
484
485 /* SP: 24/24 counters, BV_SP: 7/12 counters */
486 auto PERF_SP_BUSY_CYCLES = countable("SP", "PERF_SP_BUSY_CYCLES");
487 auto PERF_SP_ALU_WORKING_CYCLES = countable("SP", "PERF_SP_ALU_WORKING_CYCLES");
488 auto PERF_SP_EFU_WORKING_CYCLES = countable("SP", "PERF_SP_EFU_WORKING_CYCLES");
489 auto PERF_SP_STALL_CYCLES_TP = cbCountable("SP", "PERF_SP_STALL_CYCLES_TP");
490 auto PERF_SP_NON_EXECUTION_CYCLES = countable("SP", "PERF_SP_NON_EXECUTION_CYCLES");
491 auto PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_TEX_INSTRUCTIONS");
492 auto PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_EFU_INSTRUCTIONS");
493 auto PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS");
494 auto PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_EFU_INSTRUCTIONS");
495 auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS");
496 auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS");
497 auto PERF_SP_ICL1_REQUESTS = cbCountable("SP", "PERF_SP_ICL1_REQUESTS");
498 auto PERF_SP_ICL1_MISSES = cbCountable("SP", "PERF_SP_ICL1_MISSES");
499 auto PERF_SP_ANY_EU_WORKING_FS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_FS_STAGE");
500 auto PERF_SP_ANY_EU_WORKING_VS_STAGE = cbCountable("SP", "PERF_SP_ANY_EU_WORKING_VS_STAGE");
501 auto PERF_SP_ANY_EU_WORKING_CS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_CS_STAGE");
502 auto PERF_SP_PIXELS = countable("SP", "PERF_SP_PIXELS");
503 auto PERF_SP_RAY_QUERY_INSTRUCTIONS = countable("SP", "PERF_SP_RAY_QUERY_INSTRUCTIONS");
504 auto PERF_SP_RTU_BUSY_CYCLES = countable("SP", "PERF_SP_RTU_BUSY_CYCLES");
505 auto PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES = countable("SP", "PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES");
506 auto PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES = countable("SP", "PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES");
507 auto PERF_SP_RTU_RAY_BOX_INTERSECTIONS = countable("SP", "PERF_SP_RTU_RAY_BOX_INTERSECTIONS");
508 auto PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS = countable("SP", "PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS");
509 auto PERF_SP_SCH_STALL_CYCLES_RTU = countable("SP", "PERF_SP_SCH_STALL_CYCLES_RTU");
510
511 /* CMP: 1/4 counters */
512 auto PERF_CMPDECMP_VBIF_READ_DATA = countable("CMP", "PERF_CMPDECMP_VBIF_READ_DATA");
513
514 /**
515 * GPU Compute
516 */
517 disabledCounter("Avg Load-Store Instructions Per Cycle", Counter::Units::None, [=]() {
518 /* Number of average Load-Store instructions per cycle. */
519 /* Countables:
520 * PERFCOUNTER_GROUP_SP::COUNTABLE_27 = PERF_SP_LM_LOAD_INSTRUCTIONS
521 * PERFCOUNTER_GROUP_SP::COUNTABLE_28 = PERF_SP_LM_STORE_INSTRUCTIONS
522 * PERFCOUNTER_GROUP_SP::COUNTABLE_30 = PERF_SP_GM_LOAD_INSTRUCTIONS
523 * PERFCOUNTER_GROUP_SP::COUNTABLE_31 = PERF_SP_GM_STORE_INSTRUCTIONS
524 * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
525 * Notes:
526 * - FIXME: disabled due to lack of SP counter capacity
527 * - Equation: 4*sum(PERF_SP_{LM,GM}_{LOAD,STORE}_INSTRUCTIONS) / PERF_SP_BUSY_CYCLES
528 */
529 return 42;
530 }
531 );
532 counter("Bytes Data Actually Written", Counter::Units::Byte, [=]() {
533 /* Number of bytes requested to be written by the GPU. */
534 /* Countables:
535 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_18 = PERF_UCHE_EVICTS
536 * Notes:
537 * - Equation: PERF_UCHE_EVICTS * 64
538 */
539 return PERF_UCHE_EVICTS * 64;
540 }
541 );
542 counter("Bytes Data Write Requested", Counter::Units::Byte, [=]() {
543 /* Number of bytes requested to be written by the GPU. */
544 /* Countables:
545 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_15 = PERF_UCHE_WRITE_REQUESTS_SP
546 * Notes:
547 * - Equation: PERF_UCHE_WRITE_REQUESTS_SP * 16
548 */
549 return PERF_UCHE_WRITE_REQUESTS_SP * 16;
550 }
551 );
552 counter("Global Buffer Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() {
553 /* Number of bytes of global buffer data read in by the GPU, per second from the system memory (when the data is not found in L2 cache). */
554 /* Countables:
555 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP
556 * Notes:
557 * - Equation: (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time
558 */
559 return (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time;
560 }
561 );
562 counter("Global Buffer Data Read Request BW (Bytes/sec)", Counter::Units::Byte, [=]() {
563 /* Number of bytes of global buffer read requests, made by a compute kernel to the L2 cache, per second. */
564 /* Countables:
565 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_13 = PERF_UCHE_READ_REQUESTS_SP
566 * Notes:
567 * - Equation: (PERF_UCHE_READ_REQUESTS_SP * 16) / time
568 */
569 return (PERF_UCHE_READ_REQUESTS_SP * 16) / time;
570 }
571 );
572 counter("% Global Buffer Read L2 Hit", Counter::Units::Percent, [=]() {
573 /* Percentage of total global buffer read requests that were fulfilled by L2 cache hit which is populated by looking at the number of read requests that were forwarded to VBIF to read from the system memory. */
574 /* Countables:
575 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP
576 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_13 = PERF_UCHE_READ_REQUESTS_SP
577 * Notes:
578 * - Equation: (PERF_UCHE_READ_REQUESTS_SP - (PERF_UCHE_VBIF_READ_BEATS_SP / 2)) / PERF_UCHE_READ_REQUESTS_SP
579 */
580 return percent(PERF_UCHE_READ_REQUESTS_SP - (PERF_UCHE_VBIF_READ_BEATS_SP / 2), PERF_UCHE_READ_REQUESTS_SP);
581 }
582 );
583 counter("% Global Buffer Write L2 Hit", Counter::Units::Percent, [=]() {
584 /* Percentage of global write L2 Hit. */
585 /* Countables:
586 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_18 = PERF_UCHE_EVICTS
587 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_15 = PERF_UCHE_WRITE_REQUESTS_SP
588 * Notes:
589 * - Equation: (PERF_UCHE_WRITE_REQUESTS_SP - PERF_UCHE_EVICTS) / PERF_UCHE_WRITE_REQUESTS_SP
590 */
591 return percent(PERF_UCHE_WRITE_REQUESTS_SP - PERF_UCHE_EVICTS, PERF_UCHE_WRITE_REQUESTS_SP);
592 }
593 );
594 counter("Global Image Compressed Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() {
595 /* Number of bytes of global Image data (compressed) read in by the GPU per second from the system memory (when the data is not found in L2 cache). */
596 /* Countables:
597 * PERFCOUNTER_GROUP_CMP::COUNTABLE_7 = PERF_CMPDECMP_VBIF_READ_DATA
598 * Notes:
599 * - Equation: (PERF_CMPDECMP_VBIF_READ_DATA * 32) / time
600 */
601 return (PERF_CMPDECMP_VBIF_READ_DATA * 32) / time;
602 }
603 );
604 counter("Global Image Data Read Request BW (Bytes/sec)", Counter::Units::Byte, [=]() {
605 /* Number of bytes of image buffer read requests, made by a compute kernel to the L2 cache, per second. */
606 /* Countables:
607 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP
608 * Notes:
609 * - Equation: (PERF_UCHE_READ_REQUESTS_TP * 16) / time
610 */
611 return (PERF_UCHE_READ_REQUESTS_TP * 16) / time;
612 }
613 );
614 counter("Global Image Uncompressed Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() {
615 /* Number of bytes of global Image data (uncompressed) read in by the GPU per second from the system memory (when the data is not found in L2 cache). */
616 /* Countables:
617 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
618 * Notes:
619 * - Equation: (PERF_UCHE_VBIF_READ_BEATS_TP * 32) / time
620 */
621 return (PERF_UCHE_VBIF_READ_BEATS_TP * 32) / time;
622 }
623 );
624 disabledCounter("Global Memory Atomic Instructions", Counter::Units::None, [=]() {
625 /* Number of Global Memory Atomic Instructions executed by SP during a given sample period. */
626 /* Countables:
627 * PERFCOUNTER_GROUP_SP::COUNTABLE_32 = PERF_SP_GM_ATOMICS
628 * Notes:
629 * - FIXME: disabled due to lack of SP counter capacity
630 * - Equation: PERF_SP_GM_ATOMICS * 4
631 */
632 return 42;
633 }
634 );
635 disabledCounter("Global Memory Load Instructions", Counter::Units::None, [=]() {
636 /* Number of Global Memory Load Instructions executed by SP during a given sample period. */
637 /* Countables:
638 * PERFCOUNTER_GROUP_SP::COUNTABLE_30 = PERF_SP_GM_LOAD_INSTRUCTIONS
639 * Notes:
640 * - FIXME: disabled due to lack of SP counter capacity
641 * - Equation: PERF_SP_GM_LOAD_INSTRUCTIONS * 4
642 */
643 return 42;
644 }
645 );
646 disabledCounter("Global Memory Store Instructions", Counter::Units::None, [=]() {
647 /* Number of Global Memory Store Instructions executed by SP during a given sample period. */
648 /* Countables:
649 * PERFCOUNTER_GROUP_SP::COUNTABLE_31 = PERF_SP_GM_STORE_INSTRUCTIONS
650 * Notes:
651 * - FIXME: disabled due to lack of SP counter capacity
652 * - Equation: PERF_SP_GM_STORE_INSTRUCTIONS * 4
653 */
654 return 42;
655 }
656 );
657 counter("% Image Read L2 Hit", Counter::Units::Percent, [=]() {
658 /* Percentage of total image read requests that were fulfilled by L2 cache hit which is populated by looking at the number of read requests that were forwarded to VBIF to read from the system memory. */
659 /* Countables:
660 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
661 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP
662 * Notes:
663 * - Equation: (PERF_UCHE_READ_REQUESTS_TP - (PERF_UCHE_VBIF_READ_BEATS_TP / 2)) / PERF_UCHE_READ_REQUESTS_TP
664 */
665 return percent(PERF_UCHE_READ_REQUESTS_TP - (PERF_UCHE_VBIF_READ_BEATS_TP / 2), PERF_UCHE_READ_REQUESTS_TP);
666 }
667 );
668 counter("% Kernel Load Cycles", Counter::Units::Percent, [=]() {
669 /* Percentage of cycles used for a compute kernel loading; excludes execution cycles. */
670 /* Countables:
671 * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT
672 * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
673 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
674 * Notes:
675 * - Equation: (PERF_RBBM_STATUS_MASKED - (PERF_SP_BUSY_CYCLES * #uSPTP)) / PERF_CP_ALWAYS_COUNT
676 */
677 return percent(PERF_RBBM_STATUS_MASKED - (PERF_SP_BUSY_CYCLES * number_of_usptp), PERF_CP_ALWAYS_COUNT);
678 }
679 );
680 counter("% L1 Hit", Counter::Units::Percent, [=]() {
681 /* Percentage of L1 texture cache requests that were hits. */
682 /* Countables:
683 * PERFCOUNTER_GROUP_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS
684 * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
685 * Notes:
686 * - Equation: (PERF_TP_L1_CACHELINE_REQUESTS - PERF_TP_L1_CACHELINE_MISSES) / PERF_TP_L1_CACHELINE_REQUESTS
687 */
688 return percent(PERF_TP_L1_CACHELINE_REQUESTS[BR] - PERF_TP_L1_CACHELINE_MISSES[BR], PERF_TP_L1_CACHELINE_REQUESTS[BR]);
689 }
690 );
691 disabledCounter("Load-Store Utilization", Counter::Units::Percent, [=]() {
692 /* Percentage of the Load-Store unit is utilized compared to theoretical Load/Store throughput. */
693 /* Countables:
694 * PERFCOUNTER_GROUP_SP::COUNTABLE_63 = PERF_SP_LOAD_CONTROL_WORKING_CYCLES
695 * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
696 * Notes:
697 * - FIXME: disabled due to lack of SP counter capacity
698 * - Equation: PERF_SP_LOAD_CONTROL_WORKING_CYCLES / PERF_SP_BUSY_CYCLES
699 */
700 return 42;
701 }
702 );
703 disabledCounter("Local Memory Atomic Instructions", Counter::Units::None, [=]() {
704 /* Number of Local Memory Atomic Instructions executed by SP during a given sample period. */
705 /* Countables:
706 * PERFCOUNTER_GROUP_SP::COUNTABLE_29 = PERF_SP_LM_ATOMICS
707 * Notes:
708 * - FIXME: disabled due to lack of SP counter capacity
709 * - Equation: PERF_SP_LM_ATOMICS * 4
710 */
711 return 42;
712 }
713 );
714 disabledCounter("Local Memory Load Instructions", Counter::Units::None, [=]() {
715 /* Number of Local Memory Load Instructions executed by SP during a given sample period. */
716 /* Countables:
717 * PERFCOUNTER_GROUP_SP::COUNTABLE_27 = PERF_SP_LM_LOAD_INSTRUCTIONS
718 * Notes:
719 * - FIXME: disabled due to lack of SP counter capacity
720 * - Equation: PERF_SP_LM_LOAD_INSTRUCTIONS * 4
721 */
722 return 42;
723 }
724 );
725 disabledCounter("Local Memory Store Instructions", Counter::Units::None, [=]() {
726 /* Number of Local Memory Store Instructions executed by SP during a given sample period. */
727 /* Countables:
728 * PERFCOUNTER_GROUP_SP::COUNTABLE_28 = PERF_SP_LM_STORE_INSTRUCTIONS
729 * Notes:
730 * - FIXME: disabled due to lack of SP counter capacity
731 * - Equation: PERF_SP_LM_STORE_INSTRUCTIONS * 4
732 */
733 return 42;
734 }
735 );
736
737 /**
738 * GPU General
739 */
740 disabledCounter("Clocks / Second", Counter::Units::None, [=]() {
741 /* Number of GPU clocks per second. */
742 /* Countables:
743 * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT
744 * Notes:
745 * - TODO: with Adaptive Clock Distribution, the measured values are much more varied
746 * than the constant GPU frequency value we currently get, so this counter is disabled
747 * for now in favor of the GPU Frequency counter below.
748 * - Equation: PERF_CP_ALWAYS_COUNT / time
749 */
750 return 42;
751 }
752 );
753 disabledCounter("GPU % Bus Busy", Counter::Units::Percent, [=]() {
754 /* Approximate Percentage of time the GPU's bus to system memory is busy. */
755 /* Countables:
756 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
757 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_1 = PERF_UCHE_STALL_CYCLES_ARBITER
758 * PERFCOUNTER_GROUP_VBIF::COUNTABLE_34 = PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL
759 * PERFCOUNTER_GROUP_VBIF::COUNTABLE_35 = PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL
760 * PERFCOUNTER_GROUP_VBIF::COUNTABLE_46 = PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL
761 * PERFCOUNTER_GROUP_VBIF::COUNTABLE_47 = PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL
762 * Notes:
763 * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of
764 * more complex way that those counters are enabled
765 * - Equation: (PERF_UCHE_STALL_CYCLES_ARBITER + sum(PERF_GBIF_AXI{0,1}_{READ,WRITE}_DATA_BEATS_TOTAL)) / (4 * PERF_RBBM_STATUS_MASKED)
766 */
767 return 42;
768 }
769 );
770 counter("GPU Frequency", Counter::Units::None, [=]() {
771 /* Notes:
772 * - TODO: Should read from (an equivalent of) /sys/class/kgsl/kgsl-3d0/gpuclk
773 * - Same value can be retrieved through PERF_CP_ALWAYS_COUNT, until ACD enables adaptive
774 * GPU frequencies that would be covered by the Clocks / Second counter above.
775 */
776 return PERF_CP_ALWAYS_COUNT / time;
777 }
778 );
779 disabledCounter("GPU Temperature", Counter::Units::None, [=]() {
780 /* TODO: Should read from (an equivalent of) /sys/class/kgsl/kgsl-3d0/temp */
781 return 42;
782 }
783 );
784 counter("GPU % Utilization", Counter::Units::Percent, [=]() {
785 /* Percentage utilization of the GPU. */
786 /* Countables:
787 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
788 */
789 return percent(PERF_RBBM_STATUS_MASKED, max_freq);
790 }
791 );
792
793 /**
794 * GPU Memory Stats
795 */
796 counter("Avg Bytes / Fragment", Counter::Units::Byte, [=]() {
797 /* Average number of bytes transferred from main memory for each fragment. */
798 /* Countables:
799 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
800 * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
801 */
802 return safe_div(PERF_UCHE_VBIF_READ_BEATS_TP * 32, PERF_SP_PIXELS);
803 }
804 );
805 counter("Avg Bytes / Vertex", Counter::Units::Byte, [=]() {
806 /* Average number of bytes transferred from main memory for each vertex. */
807 /* Countables:
808 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_5 = PERF_UCHE_VBIF_READ_BEATS_VFD
809 * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
810 * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
811 */
812 return safe_div(PERF_UCHE_VBIF_READ_BEATS_VFD * 32, cbSum(PERF_PC_VS_INVOCATIONS));
813 }
814 );
815 disabledCounter("Read Total (Bytes/sec)", Counter::Units::Byte, [=]() {
816 /* Total number of bytes read by the GPU from memory, per second. */
817 /* Countables:
818 * PERFCOUNTER_GROUP_VBIF::COUNTABLE_34 = PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL
819 * PERFCOUNTER_GROUP_VBIF::COUNTABLE_35 = PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL
820 * Notes:
821 * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of
822 * more complex way that those counters are enabled
823 * - Equation: (PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL + PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL) * 32 / time
824 */
825 return 42;
826 }
827 );
828 counter("SP Memory Read (Bytes/sec)", Counter::Units::Byte, [=]() {
829 /* Bytes of data read from memory by the Shader Processors, per second. */
830 /* Countables:
831 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP
832 */
833 return (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time;
834 }
835 );
836 counter("Texture Memory Read BW (Bytes/sec)", Counter::Units::Byte, [=]() {
837 /* Bytes of texture data read from memory per second. */
838 /* Countables:
839 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
840 * PERFCOUNTER_GROUP_CMP::COUNTABLE_7 = PERF_CMPDECMP_VBIF_READ_DATA
841 */
842 return ((PERF_UCHE_VBIF_READ_BEATS_TP + PERF_CMPDECMP_VBIF_READ_DATA) * 32) / time;
843 }
844 );
845 counter("Vertex Memory Read (Bytes/sec)", Counter::Units::Byte, [=]() {
846 /* Bytes of vertex data read from memory per second. */
847 /* Countables:
848 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_5 = PERF_UCHE_VBIF_READ_BEATS_VFD
849 */
850 return (PERF_UCHE_VBIF_READ_BEATS_VFD * 32) / time;
851 }
852 );
853 disabledCounter("Write Total (Bytes/sec)", Counter::Units::Byte, [=]() {
854 /* Total number of bytes written by the GPU to memory, per second. */
855 /* Countables:
856 * PERFCOUNTER_GROUP_VBIF::COUNTABLE_46 = PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL
857 * PERFCOUNTER_GROUP_VBIF::COUNTABLE_47 = PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL
858 * Notes:
859 * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of
860 * more complex way that those counters are enabled
861 * - Equation: (PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL + PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL) * 32 / time
862 */
863 return 42;
864 }
865 );
866
867 /**
868 * GPU Preemption
869 */
870 counter("Avg Preemption Delay", Counter::Units::None, [=]() {
871 /* Average time (us) from the preemption request to preemption start. */
872 /* Countables:
873 * PERFCOUNTER_GROUP_CP::COUNTABLE_4 = PERF_CP_PREEMPTION_REACTION_DELAY
874 * PERFCOUNTER_GROUP_CP::COUNTABLE_3 = PERF_CP_NUM_PREEMPTIONS
875 * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT
876 * Note:
877 * - PERF_CP_NUM_PREEMPTIONS has to be divided by 2
878 */
879 if (!PERF_CP_ALWAYS_COUNT || !PERF_CP_NUM_PREEMPTIONS)
880 return 0.0;
881
882 double clocks_per_us = (double)PERF_CP_ALWAYS_COUNT / (time * 1000000);
883 double delay_us = PERF_CP_PREEMPTION_REACTION_DELAY / clocks_per_us;
884 return delay_us / ((double)PERF_CP_NUM_PREEMPTIONS / 2);
885 }
886 );
887 counter("Preemptions / second", Counter::Units::None, [=]() {
888 /* The number of GPU preemptions that occurred, per second. */
889 /* Countables:
890 * PERFCOUNTER_GROUP_CP::COUNTABLE_3 = PERF_CP_NUM_PREEMPTIONS
891 * Note:
892 * - PERF_CP_NUM_PREEMPTIONS has to be divided by 2
893 */
894 return PERF_CP_NUM_PREEMPTIONS / (2 * time);
895 }
896 );
897
898 /**
899 * GPU Primitive Processing
900 */
901 counter("Average Polygon Area", Counter::Units::None, [=]() {
902 /* Average number of pixels per polygon. */
903 /* Countables:
904 * PERFCOUNTER_GROUP_TSE::COUNTABLE_14 = PERF_TSE_OUTPUT_VISIBLE_PRIM
905 * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
906 */
907 return safe_div(PERF_SP_PIXELS, PERF_TSE_OUTPUT_VISIBLE_PRIM);
908 }
909 );
910 counter("Average Vertices / Polygon", Counter::Units::None, [=]() {
911 /* Average number of vertices per polygon. */
912 /* Countables:
913 * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
914 * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
915 * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM
916 */
917 return safe_div(cbSum(PERF_PC_VS_INVOCATIONS), PERF_TSE_INPUT_PRIM);
918 }
919 );
920 counter("Pre-clipped Polygons / Second", Counter::Units::None, [=]() {
921 /* Number of polygons submitted to the GPU, per second, before any hardware clipping. */
922 /* Countables:
923 * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM
924 */
925 return PERF_TSE_INPUT_PRIM / time;
926 }
927 );
928 counter("% Prims Clipped", Counter::Units::Percent, [=]() {
929 /* Percentage of primitives clipped by the GPU (where new primitives are generated). */
930 /* Countables:
931 * PERFCOUNTER_GROUP_TSE::COUNTABLE_9 = PERF_TSE_CLIPPED_PRIM
932 * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM
933 */
934 return percent(PERF_TSE_CLIPPED_PRIM, PERF_TSE_INPUT_PRIM);
935 }
936 );
937 counter("% Prims Trivially Rejected", Counter::Units::Percent, [=]() {
938 /* Percentage of primitives that are trivially rejected. */
939 /* Countables:
940 * PERFCOUNTER_GROUP_TSE::COUNTABLE_8 = PERF_TSE_TRIVAL_REJ_PRIM
941 * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM
942 */
943 return percent(PERF_TSE_TRIVAL_REJ_PRIM, PERF_TSE_INPUT_PRIM);
944 }
945 );
946 counter("Reused Vertices / Second", Counter::Units::None, [=]() {
947 /* Number of vertices used from the post-transform vertex buffer cache, per second. */
948 /* Countables:
949 * PERFCOUNTER_GROUP_PC::COUNTABLE_19 = PERF_PC_VERTEX_HITS
950 * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_19 = PERF_PC_VERTEX_HITS
951 */
952 return cbSum(PERF_PC_VERTEX_HITS) / time;
953 }
954 );
955
956 /**
957 * GPU Shader Processing
958 */
959 counter("ALU / Fragment", Counter::Units::None, [=]() {
960 /* Average number of scalar fragment shader ALU instructions issued per shaded fragment, expressed as full precision ALUs (2 mediump = 1 fullp). */
961 /* Countables:
962 * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS
963 * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS
964 * PERFCOUNTER_GROUP_SP::COUNTABLE_99 = PERF_SP_QUADS
965 * Notes:
966 * - PERF_SP_PIXELS is used instead of PERF_SP_QUADS to avoid SP counter group overcapacity.
967 * - PERF_SP_PIXELS ~ PERF_SP_QUADS * 4
968 * - original equation uses unmultiplied QUADS as denominator, we use PIXELS ~ QUADS * 4
969 * to match other per-fragment counters.
970 */
971 return safe_div(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2,
972 PERF_SP_PIXELS);
973 }
974 );
975 counter("ALU / Vertex", Counter::Units::None, [=]() {
976 /* Average number of vertex scalar shader ALU instructions issued per shaded vertex. */
977 /* Countables:
978 * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
979 * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
980 * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
981 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
982 * Notes:
983 * - Numerator has to be multiplied by four.
984 * - For some reason half-precision ALUs are not counted.
985 */
986 return safe_div(4 * cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS), cbSum(PERF_PC_VS_INVOCATIONS));
987 }
988 );
989 counter("% Anisotropic Filtered", Counter::Units::Percent, [=]() {
990 /* Percent of texels filtered using the 'Anisotropic' sampling method. */
991 /* Countables:
992 * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
993 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
994 * PERFCOUNTER_GROUP_TP::COUNTABLE_28 = PERF_TP_OUTPUT_PIXELS_ANISO
995 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_28 = PERF_TP_OUTPUT_PIXELS_ANISO
996 */
997 return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_ANISO), cbSum(PERF_TP_OUTPUT_PIXELS));
998 }
999 );
1000 counter("Average BVH Fetch Latency Cycles", Counter::Units::None, [=]() {
1001 /* The Average BVH Fetch Latency cycles is the latency counted from start of BVH query request till getting BVH Query result back. */
1002 /* Countables:
1003 * PERFCOUNTER_GROUP_SP::COUNTABLE_139 = PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES
1004 * PERFCOUNTER_GROUP_SP::COUNTABLE_140 = PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES
1005 * Notes:
1006 * - TODO: provisional implementation, wasn't able to verify.
1007 */
1008 return safe_div(PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES, PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES);
1009 }
1010 );
1011 counter("EFU / Fragment", Counter::Units::None, [=]() {
1012 /* Average number of scalar fragment shader EFU instructions issued per shaded fragment. */
1013 /* Countables:
1014 * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS
1015 * PERFCOUNTER_GROUP_SP::COUNTABLE_99 = PERF_SP_QUADS
1016 * Notes:
1017 * - PERF_SP_PIXELS is used instead of PERF_SP_QUADS to avoid SP counter group overcapacity.
1018 * - PERF_SP_PIXELS ~ PERF_SP_QUADS * 4
1019 * - original equation uses unmultiplied QUADS as denominator, we use PIXELS ~ QUADS * 4
1020 * to match other per-fragment counters.
1021 */
1022 return safe_div(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, PERF_SP_PIXELS);
1023 }
1024 );
1025 counter("EFU / Vertex", Counter::Units::None, [=]() {
1026 /* Average number of scalar vertex shader EFU instructions issued per shaded vertex. */
1027 /* Countables:
1028 * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1029 * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1030 * PERFCOUNTER_GROUP_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
1031 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
1032 * Notes:
1033 * - Numerator has to be multiplied by four.
1034 */
1035 return safe_div(4 * cbSum(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS), cbSum(PERF_PC_VS_INVOCATIONS));
1036 }
1037 );
1038 counter("Fragment ALU Instructions / Sec (Full)", Counter::Units::None, [=]() {
1039 /* Total number of full precision fragment shader instructions issued, per second. */
1040 /* Countables:
1041 * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS
1042 * Notes:
1043 * - Numerator has to be multiplied by four.
1044 */
1045 return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * 4) / time;
1046 }
1047 );
1048 counter("Fragment ALU Instructions / Sec (Half)", Counter::Units::None, [=]() {
1049 /* Total number of half precision Scalar fragment shader instructions issued, per second. */
1050 /* Countables:
1051 * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS
1052 * Notes:
1053 * - Numerator has to be multiplied by four.
1054 */
1055 return (PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * 4) / time;
1056 }
1057 );
1058 counter("Fragment EFU Instructions / Second", Counter::Units::None, [=]() {
1059 /* Total number of Scalar fragment shader Elementary Function Unit (EFU) instructions issued, per second. */
1060 /* Countables:
1061 * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS
1062 * Notes:
1063 * - Numerator has to be multiplied by four.
1064 */
1065 return (PERF_SP_FS_STAGE_EFU_INSTRUCTIONS * 4) / time;
1066 }
1067 );
1068 counter("Fragment Instructions / Second", Counter::Units::None, [=]() {
1069 /* Total number of fragment shader instructions issued, per second. */
1070 /* Countables:
1071 * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS
1072 * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS
1073 * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS
1074 * Notes:
1075 * - Numerator has to be multiplied by four.
1076 */
1077 return (4 * (PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
1078 + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2)) / time;
1079 }
1080 );
1081 counter("Fragments Shaded / Second", Counter::Units::None, [=]() {
1082 /* Number of fragments submitted to the shader engine, per second. */
1083 /* Countables:
1084 * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
1085 */
1086 return PERF_SP_PIXELS / time;
1087 }
1088 );
1089 counter("% Linear Filtered", Counter::Units::Percent, [=]() {
1090 /* Percent of texels filtered using the 'Linear' sampling method. */
1091 /* Countables:
1092 * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1093 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1094 * PERFCOUNTER_GROUP_TP::COUNTABLE_26 = PERF_TP_OUTPUT_PIXELS_BILINEAR
1095 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_26 = PERF_TP_OUTPUT_PIXELS_BILINEAR
1096 */
1097 return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_BILINEAR), cbSum(PERF_TP_OUTPUT_PIXELS));
1098 }
1099 );
1100 counter("% Nearest Filtered", Counter::Units::Percent, [=]() {
1101 /* Percent of texels filtered using the 'Nearest' sampling method. */
1102 /* Countables:
1103 * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1104 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1105 * PERFCOUNTER_GROUP_TP::COUNTABLE_25 = PERF_TP_OUTPUT_PIXELS_POINT
1106 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_25 = PERF_TP_OUTPUT_PIXELS_POINT
1107 */
1108 return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_POINT), cbSum(PERF_TP_OUTPUT_PIXELS));
1109 }
1110 );
1111 disabledCounter("% Non-Base Level Textures", Counter::Units::Percent, [=]() {
1112 /* Percent of texels coming from a non-base MIP level. */
1113 /* Countables:
1114 * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1115 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1116 * PERFCOUNTER_GROUP_TP::COUNTABLE_29 = PERF_TP_OUTPUT_PIXELS_ZERO_LOD
1117 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_29 = PERF_TP_OUTPUT_PIXELS_ZERO_LOD
1118 * Notes:
1119 * - FIXME: disabled due to lack of TP counter capacity
1120 * - Equation: 100.0 - percent(cbSum(PERF_TP_OUTPUT_PIXELS_ZERO_LOD), cbSum(PERF_TP_OUTPUT_PIXELS));
1121 */
1122 return 42;
1123 }
1124 );
1125 counter("% RTU Busy", Counter::Units::Percent, [=]() {
1126 /* Percentage of time that Ray Tracing Unit in SP is busy compared to whole SP. */
1127 /* Countables:
1128 * PERFCOUNTER_GROUP_SP::COUNTABLE_125 = PERF_SP_RTU_BUSY_CYCLES
1129 * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1130 * Notes:
1131 * - TODO: provisional implementation, wasn't able to verify.
1132 */
1133 return percent(PERF_SP_RTU_BUSY_CYCLES, PERF_SP_BUSY_CYCLES);
1134 }
1135 );
1136 counter("RTU Ray Box Intersections Per Instruction", Counter::Units::None, [=]() {
1137 /* Number of Ray Box intersections per instruction. */
1138 /* Countables:
1139 * PERFCOUNTER_GROUP_SP::COUNTABLE_148 = PERF_SP_RTU_RAY_BOX_INTERSECTIONS
1140 * PERFCOUNTER_GROUP_SP::COUNTABLE_122 = PERF_SP_RAY_QUERY_INSTRUCTIONS
1141 * Notes:
1142 * - TODO: provisional implementation, wasn't able to verify.
1143 */
1144 return safe_div(PERF_SP_RTU_RAY_BOX_INTERSECTIONS, PERF_SP_RAY_QUERY_INSTRUCTIONS);
1145 }
1146 );
1147 counter("RTU Ray Triangle Intersections Per Instruction", Counter::Units::None, [=]() {
1148 /* Number of Ray Triangle intersections per instruction. */
1149 /* Countables:
1150 * PERFCOUNTER_GROUP_SP::COUNTABLE_149 = PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS
1151 * PERFCOUNTER_GROUP_SP::COUNTABLE_122 = PERF_SP_RAY_QUERY_INSTRUCTIONS
1152 * Notes:
1153 * - TODO: provisional implementation, wasn't able to verify.
1154 */
1155 return safe_div(PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS, PERF_SP_RAY_QUERY_INSTRUCTIONS);
1156 }
1157 );
1158 counter("% Shader ALU Capacity Utilized", Counter::Units::Percent, [=]() {
1159 /* Percent of maximum shader capacity (ALU operations) utilized. */
1160 /* Countables:
1161 * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1162 * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
1163 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
1164 * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS
1165 * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS
1166 * Notes:
1167 * - Numerator has to be multiplied by four.
1168 */
1169 int64_t numerator = cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS) +
1170 PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2;
1171 int64_t denominator = PERF_SP_BUSY_CYCLES * number_of_alus_per_usptp;
1172 return percent(numerator, denominator);
1173 }
1174 );
1175 counter("% Shaders Busy", Counter::Units::Percent, [=]() {
1176 /* Percentage of time that all Shader cores are busy. */
1177 /* Countables:
1178 * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1179 * PERFCOUNTER_GROUP_TP::COUNTABLE_0 = PERF_TP_BUSY_CYCLES
1180 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1181 * Notes:
1182 * - SP_BUSY_CYCLES seems to be used as the numerator -- unless it's zero,
       * at which point TP_BUSY_CYCLES seems to be used instead.
1184 */
1185
1186 int64_t numerator = PERF_SP_BUSY_CYCLES;
1187 if (!numerator)
1188 numerator = PERF_TP_BUSY_CYCLES;
1189 return percent(numerator, number_of_usptp * PERF_RBBM_STATUS_MASKED);
1190 }
1191 );
1192 counter("% Shaders Stalled", Counter::Units::Percent, [=]() {
1193 /* Percentage of time that all shader cores are idle with at least one active wave. */
1194 /* Countables:
1195 * PERFCOUNTER_GROUP_SP::COUNTABLE_7 = PERF_SP_NON_EXECUTION_CYCLES
1196 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1197 */
1198 return percent(PERF_SP_NON_EXECUTION_CYCLES, number_of_usptp * PERF_RBBM_STATUS_MASKED);
1199 }
1200 );
1201 counter("% Texture Pipes Busy", Counter::Units::Percent, [=]() {
1202 /* Percentage of time that any texture pipe is busy. */
1203 /* Countables:
1204 * PERFCOUNTER_GROUP_TP::COUNTABLE_0 = PERF_TP_BUSY_CYCLES
1205 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1206 */
1207 return percent(PERF_TP_BUSY_CYCLES, number_of_usptp * PERF_RBBM_STATUS_MASKED);
1208 }
1209 );
1210 counter("Textures / Fragment", Counter::Units::None, [=]() {
1211 /* Average number of textures referenced per fragment. */
1212 /* Countables:
1213 * PERFCOUNTER_GROUP_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
1214 * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
1215 * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
1216 */
1217 return safe_div(PERF_TP_OUTPUT_PIXELS[BR], PERF_SP_PIXELS);
1218 }
1219 );
1220 counter("Textures / Vertex", Counter::Units::None, [=]() {
1221 /* Average number of textures referenced per vertex. */
1222 /* Countables:
1223 * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1224 * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1225 * PERFCOUNTER_GROUP_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
1226 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
1227 * Notes:
1228 * - Numerator has to be multiplied by four.
1229 */
1230 return safe_div(4 * cbSum(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS), cbSum(PERF_PC_VS_INVOCATIONS));
1231 }
1232 );
1233 counter("% Time ALUs Working", Counter::Units::Percent, [=]() {
1234 /* Percentage of time the ALUs are working while the Shaders are busy. */
1235 /* Countables:
1236 * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1237 * PERFCOUNTER_GROUP_SP::COUNTABLE_1 = PERF_SP_ALU_WORKING_CYCLES
1238 * Notes:
1239 * - ALU working cycles have to be halved.
1240 */
1241 return percent(PERF_SP_ALU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
1242 }
1243 );
1244 counter("% Time Compute", Counter::Units::Percent, [=]() {
1245 /* Amount of time spent in compute work compared to the total time spent shading everything. */
1246 /* Countables:
1247 * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
1248 * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1249 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1250 * PERFCOUNTER_GROUP_SP::COUNTABLE_78 = PERF_SP_ANY_EU_WORKING_CS_STAGE
1251 * CS_STAGE amount is also counted in FS_STAGE, so it shouldn't be summed into the total value.
1252 */
1253 int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE +
1254 cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE);
1255 return percent(PERF_SP_ANY_EU_WORKING_CS_STAGE, total);
1256 }
1257 );
1258 counter("% Time EFUs Working", Counter::Units::Percent, [=]() {
1259 /* Percentage of time the EFUs are working while the Shaders are busy. */
1260 /* Countables:
1261 * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
1262 * PERFCOUNTER_GROUP_SP::COUNTABLE_2 = PERF_SP_EFU_WORKING_CYCLES
1263 */
1264 return percent(PERF_SP_EFU_WORKING_CYCLES, PERF_SP_BUSY_CYCLES);
1265 }
1266 );
1267 counter("% Time Shading Fragments", Counter::Units::Percent, [=]() {
1268 /* Amount of time spent shading fragments compared to the total time spent shading everything. */
1269 /* Countables:
1270 * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
1271 * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1272 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1273 * PERFCOUNTER_GROUP_SP::COUNTABLE_78 = PERF_SP_ANY_EU_WORKING_CS_STAGE
1274 * Notes:
1275 * - CS_STAGE amount is also counted in FS_STAGE, so fragment time has to be retrieved
1276 * through subtraction and the compute time shouldn't be summed into the total value.
1277 */
1278 int64_t fragments = PERF_SP_ANY_EU_WORKING_FS_STAGE - PERF_SP_ANY_EU_WORKING_CS_STAGE;
1279 int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE +
1280 cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE);
1281 return percent(fragments, total);
1282 }
1283 );
1284 counter("% Time Shading Vertices", Counter::Units::Percent, [=]() {
1285 /* Amount of time spent shading vertices compared to the total time spent shading everything. */
1286 /* Countables:
1287 * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
1288 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
1289 * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1290 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
1291 * Notes:
1292 * - CS_STAGE amount is also counted in FS_STAGE, so it shouldn't be summed into the total value.
1293 */
1294 int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE +
1295 cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE);
1296 return percent(cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE), total);
1297 }
1298 );
1299 counter("Vertex Instructions / Second", Counter::Units::None, [=]() {
1300 /* Total number of scalar vertex shader instructions issued, per second. */
1301 /* Countables:
1302 * PERFCOUNTER_GROUP_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
1303 * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
1304 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
1305 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
1306 * Notes:
1307 - Numerator has to be multiplied by four.
1308 */
1309 return (4 * (cbSum(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS) + cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS))) / time;
1310 }
1311 );
1312 counter("Vertices Shaded / Second", Counter::Units::None, [=]() {
1313 /* Number of vertices submitted to the shader engine, per second. */
1314 /* Countables:
1315 * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1316 * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
1317 */
1318 return cbSum(PERF_PC_VS_INVOCATIONS) / time;
1319 }
1320 );
1321 disabledCounter("% Wave Context Occupancy", Counter::Units::Percent, [=]() {
1322 /* Average percentage of wave context occupancy per cycle. */
1323 /* Countables:
1324 * PERFCOUNTER_GROUP_SP::COUNTABLE_8 = PERF_SP_WAVE_CONTEXTS
1325 * PERFCOUNTER_GROUP_SP::COUNTABLE_9 = PERF_SP_WAVE_CONTEXT_CYCLES
1326 * Note:
1327 * - FIXME: disabled due to lack of SP counter capacity
1328 * - the quotient has to be divided by the number of execution wave slots per SP (16 on a7xx)
1329 * - Equation: (PERF_SP_WAVE_CONTEXTS / PERF_SP_WAVE_CONTEXT_CYCLES) / number_of_execution_wave_slots_per_sp;
1330 */
1331 return 42;
1332 }
1333 );
1334
1335 /**
1336 * GPU Stalls
1337 */
1338 counter("% BVH Fetch Stall", Counter::Units::Percent, [=]() {
1339 /* Percentage of clock cycles where the RTU could not make any more requests for BVH fetch from scheduler. */
1340 /* Countables:
1341 * PERFCOUNTER_GROUP_SP::COUNTABLE_150 = PERF_SP_SCH_STALL_CYCLES_RTU
1342 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1343 * Notes:
1344 * - TODO: provisional implementation, wasn't able to verify.
1345 */
1346 return percent(PERF_SP_SCH_STALL_CYCLES_RTU, PERF_RBBM_STATUS_MASKED);
1347 }
1348 );
1349 counter("% Instruction Cache Miss", Counter::Units::Percent, [=]() {
1350 /* Number of L1 instruction cache misses divided by L1 instruction cache requests. */
1351 /* Countables:
1352 * PERFCOUNTER_GROUP_SP::COUNTABLE_51 = PERF_SP_ICL1_REQUESTS
1353 * PERFCOUNTER_GROUP_SP::COUNTABLE_52 = PERF_SP_ICL1_MISSES
1354 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_51 = PERF_SP_ICL1_REQUESTS
1355 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_52 = PERF_SP_ICL1_MISSES
1356 */
1357 return percent(cbSum(PERF_SP_ICL1_MISSES), cbSum(PERF_SP_ICL1_REQUESTS));
1358 }
1359 );
1360 counter("L1 Texture Cache Miss Per Pixel", Counter::Units::None, [=]() {
1361 /* Average number of Texture L1 cache misses per pixel. */
1362 /* Countables:
1363 * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
1364 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
1365 * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
1366 */
1367 return safe_div(cbSum(PERF_TP_L1_CACHELINE_MISSES), PERF_SP_PIXELS);
1368 }
1369 );
1370 counter("% Stalled On System Memory", Counter::Units::Percent, [=]() {
1371 /* Percentage of cycles the L2 cache is stalled waiting for data from system memory. */
1372 /* Countables:
1373 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_1 = PERF_UCHE_STALL_CYCLES_ARBITER
1374 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1375 * Notes:
1376 * - denominator has to be multiplied by four, for unknown reasons.
1377 */
1378 return safe_div(PERF_UCHE_STALL_CYCLES_ARBITER, 4 * PERF_RBBM_STATUS_MASKED);
1379 }
1380 );
1381 counter("% Texture Fetch Stall", Counter::Units::Percent, [=]() {
1382 /* Percentage of clock cycles where the shader processors cannot make any more requests for texture data. */
1383 /* Countables:
1384 * PERFCOUNTER_GROUP_SP::COUNTABLE_4 = PERF_SP_STALL_CYCLES_TP
1385 * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_4 = PERF_SP_STALL_CYCLES_TP
1386 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1387 */
1388 return percent(cbSum(PERF_SP_STALL_CYCLES_TP), number_of_usptp * PERF_RBBM_STATUS_MASKED);
1389 }
1390 );
1391 counter("% Texture L1 Miss", Counter::Units::Percent, [=]() {
1392 /* Number of L1 texture cache misses divided by L1 texture cache requests. */
1393 /* Countables:
1394 * PERFCOUNTER_GROUP_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS
1395 * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
1396 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS
1397 * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES
1398 */
1399 return percent(cbSum(PERF_TP_L1_CACHELINE_MISSES), cbSum(PERF_TP_L1_CACHELINE_REQUESTS));
1400 }
1401 );
1402 counter("% Texture L2 Miss", Counter::Units::Percent, [=]() {
1403 /* Number of L2 texture cache misses divided by L2 texture cache requests. */
1404 /* Countables:
1405 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP
1406 * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP
1407 * Notes:
1408 * - ratio has to be multiplied by two. Unsure how this constant comes up.
1409 */
1410 return percent(2 * PERF_UCHE_VBIF_READ_BEATS_TP, PERF_UCHE_READ_REQUESTS_TP);
1411 }
1412 );
1413 counter("% Vertex Fetch Stall", Counter::Units::Percent, [=]() {
1414 /* Percentage of clock cycles where the GPU cannot make any more requests for vertex data. */
1415 /* Countables:
1416 * PERFCOUNTER_GROUP_PC::COUNTABLE_2 = PERF_PC_STALL_CYCLES_VFD
1417 * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_2 = PERF_PC_STALL_CYCLES_VFD
1418 * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
1419 */
1420 return percent(cbSum(PERF_PC_STALL_CYCLES_VFD), PERF_RBBM_STATUS_MASKED);
1421 }
1422 );
1423 }
1424
1425 /**
 * Generate and submit the cmdstream to configure the counter/countable
1427 * muxing
1428 */
1429 void
configure_counters(bool reset,bool wait)1430 FreedrenoDriver::configure_counters(bool reset, bool wait)
1431 {
1432 struct fd_submit *submit = fd_submit_new(pipe);
1433 enum fd_ringbuffer_flags flags =
1434 (enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
1435 struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0x1000, flags);
1436
1437 for (const auto &countable : countables)
1438 countable.configure(ring, reset);
1439
1440 struct fd_fence *fence = fd_submit_flush(submit, -1, false);
1441
1442 fd_fence_flush(fence);
1443 fd_fence_del(fence);
1444
1445 fd_ringbuffer_del(ring);
1446 fd_submit_del(submit);
1447
1448 if (wait)
1449 fd_pipe_wait(pipe, fence);
1450 }
1451
1452 /**
1453 * Read the current counter values and record the time.
1454 */
1455 void
collect_countables()1456 FreedrenoDriver::collect_countables()
1457 {
1458 last_dump_ts = perfetto::base::GetBootTimeNs().count();
1459
1460 for (const auto &countable : countables)
1461 countable.collect();
1462 }
1463
1464 bool
init_perfcnt()1465 FreedrenoDriver::init_perfcnt()
1466 {
1467 uint64_t val;
1468
1469 if (dev)
1470 return true;
1471
1472 dev = fd_device_new(drm_device.fd);
1473 pipe = fd_pipe_new2(dev, FD_PIPE_3D, 0);
1474 dev_id = fd_pipe_dev_id(pipe);
1475
1476 if (fd_pipe_get_param(pipe, FD_MAX_FREQ, &val)) {
1477 PERFETTO_FATAL("Could not get MAX_FREQ");
1478 return false;
1479 }
1480 max_freq = val;
1481
1482 if (fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val)) {
1483 PERFETTO_ILOG("Could not get SUSPEND_COUNT");
1484 } else {
1485 suspend_count = val;
1486 has_suspend_count = true;
1487 }
1488
1489 fd_pipe_set_param(pipe, FD_SYSPROF, 1);
1490
1491 perfcntrs = fd_perfcntrs(fd_pipe_dev_id(pipe), &num_perfcntrs);
1492 if (num_perfcntrs == 0) {
1493 PERFETTO_FATAL("No hw counters available");
1494 return false;
1495 }
1496
1497 assigned_counters.resize(num_perfcntrs);
1498 assigned_counters.assign(assigned_counters.size(), 0);
1499
1500 info = fd_dev_info_raw(dev_id);
1501
1502 switch (fd_dev_gen(dev_id)) {
1503 case 6:
1504 setup_a6xx_counters();
1505 break;
1506 case 7:
1507 setup_a7xx_counters();
1508 break;
1509 default:
1510 PERFETTO_FATAL("Unsupported GPU: a%03u", fd_dev_gpu_id(dev_id));
1511 return false;
1512 }
1513
1514 state.resize(next_countable_id);
1515
1516 for (const auto &countable : countables)
1517 countable.resolve();
1518
1519 io = fd_dt_find_io();
1520 if (!io) {
1521 PERFETTO_FATAL("Could not map GPU I/O space");
1522 return false;
1523 }
1524
1525 configure_counters(true, true);
1526 collect_countables();
1527
1528 return true;
1529 }
1530
1531 void
enable_counter(const uint32_t counter_id)1532 FreedrenoDriver::enable_counter(const uint32_t counter_id)
1533 {
1534 enabled_counters.push_back(counters[counter_id]);
1535 }
1536
1537 void
enable_all_counters()1538 FreedrenoDriver::enable_all_counters()
1539 {
1540 enabled_counters.reserve(counters.size());
1541 for (auto &counter : counters) {
1542 enabled_counters.push_back(counter);
1543 }
1544 }
1545
/* No-op: the counters are programmed in init_perfcnt()/configure_counters(),
 * and the requested sampling period is not used by this driver.
 */
void
FreedrenoDriver::enable_perfcnt(const uint64_t /* sampling_period_ns */)
{
}
1550
1551 bool
dump_perfcnt()1552 FreedrenoDriver::dump_perfcnt()
1553 {
1554 if (has_suspend_count) {
1555 uint64_t val;
1556
1557 fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val);
1558
1559 if (suspend_count != val) {
1560 PERFETTO_ILOG("Device had suspended!");
1561
1562 suspend_count = val;
1563
1564 configure_counters(true, true);
1565 collect_countables();
1566
1567 /* We aren't going to have anything sensible by comparing
1568 * current values to values from prior to the suspend, so
1569 * just skip this sampling period.
1570 */
1571 return false;
1572 }
1573 }
1574
1575 auto last_ts = last_dump_ts;
1576
1577 /* Capture the timestamp from the *start* of the sampling period: */
1578 last_capture_ts = last_dump_ts;
1579
1580 collect_countables();
1581
1582 auto elapsed_time_ns = last_dump_ts - last_ts;
1583
1584 time = (float)elapsed_time_ns / 1000000000.0;
1585
1586 /* On older kernels that dont' support querying the suspend-
1587 * count, just send configuration cmdstream regularly to keep
1588 * the GPU alive and correctly configured for the countables
1589 * we want
1590 */
1591 if (!has_suspend_count) {
1592 configure_counters(false, false);
1593 }
1594
1595 return true;
1596 }
1597
next()1598 uint64_t FreedrenoDriver::next()
1599 {
1600 auto ret = last_capture_ts;
1601 last_capture_ts = 0;
1602 return ret;
1603 }
1604
disable_perfcnt()1605 void FreedrenoDriver::disable_perfcnt()
1606 {
1607 /* There isn't really any disable, only reconfiguring which countables
1608 * get muxed to which counters
1609 */
1610 }
1611
1612 /*
1613 * Countable
1614 */
1615
1616 FreedrenoDriver::Countable
countable(std::string group,std::string name)1617 FreedrenoDriver::countable(std::string group, std::string name)
1618 {
1619 auto countable = Countable(this, group, name);
1620 countables.emplace_back(countable);
1621 return countable;
1622 }
1623
Countable(FreedrenoDriver * d,std::string group,std::string name)1624 FreedrenoDriver::Countable::Countable(FreedrenoDriver *d, std::string group, std::string name)
1625 : id {d->next_countable_id++}, d {d}, group {group}, name {name}
1626 {
1627 }
1628
/* Emit register writes on ring to configure counter/countable muxing: */
void
FreedrenoDriver::Countable::configure(struct fd_ringbuffer *ring, bool reset) const
{
   const struct fd_perfcntr_countable *countable = d->state[id].countable;
   const struct fd_perfcntr_counter *counter = d->state[id].counter;

   /* Let in-flight work drain before touching counter registers: */
   OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);

   /* When resetting, disable the counter while it is reprogrammed
    * (only counters that have an enable register):
    */
   if (counter->enable && reset) {
      OUT_PKT4(ring, counter->enable, 1);
      OUT_RING(ring, 0);
   }

   /* When resetting, pulse the clear bit (write 1 then 0) to zero the
    * counter value:
    */
   if (counter->clear && reset) {
      OUT_PKT4(ring, counter->clear, 1);
      OUT_RING(ring, 1);

      OUT_PKT4(ring, counter->clear, 1);
      OUT_RING(ring, 0);
   }

   /* Mux the chosen countable onto the assigned counter: */
   OUT_PKT4(ring, counter->select_reg, 1);
   OUT_RING(ring, countable->selector);

   /* Finally re-enable the counter: */
   if (counter->enable && reset) {
      OUT_PKT4(ring, counter->enable, 1);
      OUT_RING(ring, 1);
   }
}
1659
/* Collect current counter value and calculate delta since last sample: */
void
FreedrenoDriver::Countable::collect() const
{
   const struct fd_perfcntr_counter *counter = d->state[id].counter;

   /* Keep the previous sample so get_value() can report the delta: */
   d->state[id].last_value = d->state[id].value;

   /* this is true on a5xx and later */
   assert(counter->counter_reg_lo + 1 == counter->counter_reg_hi);
   uint64_t *reg = (uint64_t *)((uint32_t *)d->io + counter->counter_reg_lo);

   /* NOTE(review): this reads the lo/hi register pair as a single 64b
    * access and assumes the hw presents a coherent value — confirm
    * against the hw docs for tearing across the 32b halves.
    */
   d->state[id].value = *reg;
}
1674
1675 /* Resolve the countable and assign next counter from it's group: */
1676 void
resolve() const1677 FreedrenoDriver::Countable::resolve() const
1678 {
1679 for (unsigned i = 0; i < d->num_perfcntrs; i++) {
1680 const struct fd_perfcntr_group *g = &d->perfcntrs[i];
1681 if (group != g->name)
1682 continue;
1683
1684 for (unsigned j = 0; j < g->num_countables; j++) {
1685 const struct fd_perfcntr_countable *c = &g->countables[j];
1686 if (name != c->name)
1687 continue;
1688
1689 d->state[id].countable = c;
1690
1691 /* Assign a counter from the same group: */
1692 assert(d->assigned_counters[i] < g->num_counters);
1693 d->state[id].counter = &g->counters[d->assigned_counters[i]++];
1694
1695 std::cout << "Countable: " << name << ", group=" << g->name <<
1696 ", counter=" << d->assigned_counters[i] - 1 << "\n";
1697
1698 return;
1699 }
1700 }
1701 unreachable("no such countable!");
1702 }
1703
1704 uint64_t
get_value() const1705 FreedrenoDriver::Countable::get_value() const
1706 {
1707 return d->state[id].value - d->state[id].last_value;
1708 }
1709
1710 /*
1711 * DerivedCounter
1712 */
1713
DerivedCounter(FreedrenoDriver * d,std::string name,Counter::Units units,std::function<int64_t ()> derive)1714 FreedrenoDriver::DerivedCounter::DerivedCounter(FreedrenoDriver *d, std::string name,
1715 Counter::Units units,
1716 std::function<int64_t()> derive)
1717 : Counter(d->next_counter_id++, name, 0)
1718 {
1719 std::cout << "DerivedCounter: " << name << ", id=" << id << "\n";
1720 this->units = units;
1721 set_getter([=](const Counter &c, const Driver &d) {
1722 return derive();
1723 }
1724 );
1725 }
1726
1727 FreedrenoDriver::DerivedCounter
counter(std::string name,Counter::Units units,std::function<int64_t ()> derive)1728 FreedrenoDriver::counter(std::string name, Counter::Units units,
1729 std::function<int64_t()> derive)
1730 {
1731 auto counter = DerivedCounter(this, name, units, derive);
1732 counters.emplace_back(counter);
1733 return counter;
1734 }
1735
/* Timestamps are taken with perfetto's GetBootTimeNs() (see
 * gpu_timestamp()), so report the matching builtin clock id:
 */
uint32_t
FreedrenoDriver::gpu_clock_id() const
{
   return perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME;
}
1741
1742 uint64_t
gpu_timestamp() const1743 FreedrenoDriver::gpu_timestamp() const
1744 {
1745 return perfetto::base::GetBootTimeNs().count();
1746 }
1747
/* Sampling a correlated CPU/GPU timestamp pair is not supported by
 * this driver; the output parameters are left untouched.
 */
bool
FreedrenoDriver::cpu_gpu_timestamp(uint64_t &, uint64_t &) const
{
   /* Not supported */
   return false;
}
1754
1755 } // namespace pps
1756