1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "si_build_pm4.h"
26 #include "si_query.h"
27 #include "util/u_memory.h"
28
/* Properties of a hardware performance-counter block, OR'd together in
 * si_pc_block_base::flags. */
enum si_pc_block_flags
{
   /* This block is part of the shader engine */
   SI_PC_BLOCK_SE = (1 << 0),

   /* Expose per-instance groups instead of summing all instances (within
    * an SE). */
   SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),

   /* Expose per-SE groups instead of summing instances across SEs. */
   SI_PC_BLOCK_SE_GROUPS = (1 << 2),

   /* Shader block */
   SI_PC_BLOCK_SHADER = (1 << 3),

   /* Non-shader block with perfcounters windowed by shaders. */
   SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
};
47
/* Describes how a block's SELECT/SELECT1 registers are laid out in the
 * register file; consumed by si_pc_emit_select(). The low bits
 * (SI_PC_MULTI_MASK) choose one of the layouts below; the remaining bits
 * are modifier flags. */
enum si_pc_reg_layout
{
   /* All secondary selector dwords follow as one block after the primary
    * selector dwords for the counters that have secondary selectors.
    *
    * Example:
    * PERFCOUNTER0_SELECT
    * PERFCOUNTER1_SELECT
    * PERFCOUNTER0_SELECT1
    * PERFCOUNTER1_SELECT1
    * PERFCOUNTER2_SELECT
    * PERFCOUNTER3_SELECT
    */
   SI_PC_MULTI_BLOCK = 0,

   /* Each secondary selector dword follows immediately after the
    * corresponding primary.
    *
    * Example:
    * PERFCOUNTER0_SELECT
    * PERFCOUNTER0_SELECT1
    * PERFCOUNTER1_SELECT
    * PERFCOUNTER1_SELECT1
    * PERFCOUNTER2_SELECT
    * PERFCOUNTER3_SELECT
    */
   SI_PC_MULTI_ALTERNATE = 1,

   /* All secondary selector dwords follow as one block after all primary
    * selector dwords.
    *
    * Example:
    * PERFCOUNTER0_SELECT
    * PERFCOUNTER1_SELECT
    * PERFCOUNTER2_SELECT
    * PERFCOUNTER3_SELECT
    * PERFCOUNTER0_SELECT1
    * PERFCOUNTER1_SELECT1
    */
   SI_PC_MULTI_TAIL = 2,

   /* Free-form arrangement of selector registers (explicit list in
    * si_pc_block_base::select). */
   SI_PC_MULTI_CUSTOM = 3,

   SI_PC_MULTI_MASK = 3,

   /* Registers are laid out in decreasing rather than increasing order. */
   SI_PC_REG_REVERSE = 4,

   /* Block has no real registers; results are read as immediates. */
   SI_PC_FAKE = 8,
};
99
/* Register-level description of one hardware counter block type, shared by
 * all chips that have that block. */
struct si_pc_block_base {
   const char *name;       /* name exposed in the counter group name */
   unsigned num_counters;  /* number of usable counters in this block */
   unsigned flags;         /* mask of si_pc_block_flags */

   unsigned select_or;     /* bits OR'd into every SELECT value written */
   unsigned select0;       /* first SELECT register (when layout is regular) */
   unsigned counter0_lo;   /* first COUNTER_LO result register */
   unsigned *select;       /* explicit SELECT register list (SI_PC_MULTI_CUSTOM) */
   unsigned *counters;     /* explicit result register list, overrides counter0_lo */
   unsigned num_multi;     /* counters that also have a SELECT1 register */
   unsigned num_prelude;   /* zero dwords emitted before the selectors */
   unsigned layout;        /* mask of si_pc_reg_layout */
};
114
/* Per-GPU-generation description of one counter block: which base block it
 * is plus how many selectors/instances that generation exposes. */
struct si_pc_block_gfxdescr {
   struct si_pc_block_base *b;
   unsigned selectors;  /* number of valid event selector values */
   unsigned instances;  /* instances per shader engine (0/unset means 1) */
};
120
/* Runtime state for one counter block on the current chip, including the
 * generated group/selector name tables exposed to the state tracker. */
struct si_pc_block {
   const struct si_pc_block_gfxdescr *b;
   unsigned num_instances;  /* actual instance count on this chip */

   unsigned num_groups;     /* groups exposed (per SE/instance expansion) */
   char *group_names;       /* packed array of group name strings */
   unsigned group_name_stride;

   char *selector_names;    /* packed array of selector name strings */
   unsigned selector_name_stride;
};
132
/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
 * performance counter group IDs.
 */
static const char *const si_pc_shader_type_suffixes[] = {"", "_ES", "_GS", "_VS",
                                                         "_PS", "_LS", "_HS", "_CS"};

/* SQ_PERFCOUNTER_CTRL enable masks matching the suffix table above; entry 0
 * (0x7f) enables all shader stages at once. */
static const unsigned si_pc_shader_type_bits[] = {
   0x7f,
   S_036780_ES_EN(1),
   S_036780_GS_EN(1),
   S_036780_VS_EN(1),
   S_036780_PS_EN(1),
   S_036780_LS_EN(1),
   S_036780_HS_EN(1),
   S_036780_CS_EN(1),
};
149
/* Max counters per HW block */
#define SI_QUERY_MAX_COUNTERS 16

/* Sentinel group id bit marking the shader-windowing pseudo-group. */
#define SI_PC_SHADERS_WINDOWING (1u << 31)

/* One hardware counter group selected by a query: a block plus the SE and
 * instance it is bound to (negative = broadcast/summed) and the chosen
 * event selectors. */
struct si_query_group {
   struct si_query_group *next;
   struct si_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;
   int instance;
   unsigned num_counters;
   unsigned selectors[SI_QUERY_MAX_COUNTERS];
};

/* Describes where one user-visible counter's values live inside the result
 * buffer: `qwords` samples starting at `base`, `stride` apart. */
struct si_query_counter {
   unsigned base;
   unsigned qwords;
   unsigned stride; /* in uint64s */
};

/* A performance-counter query instance. */
struct si_query_pc {
   struct si_query b;
   struct si_query_buffer buffer;

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;      /* shader-stage window mask, 0 = not windowed */
   unsigned num_counters; /* entries in `counters` */
   struct si_query_counter *counters;
   struct si_query_group *groups; /* linked list of selected groups */
};
184
/* Register descriptions of the counter blocks for GFX7 (CIK) and newer;
 * most are reused unchanged on GFX8/GFX9/GFX10. */

static struct si_pc_block_base cik_CB = {
   .name = "CB",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_037000_CB_PERFCOUNTER_FILTER,
   .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
   .num_multi = 1,
   .num_prelude = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

/* CPC selectors are not in a regular order, hence the explicit list. */
static unsigned cik_CPC_select[] = {
   R_036024_CPC_PERFCOUNTER0_SELECT,
   R_036010_CPC_PERFCOUNTER0_SELECT1,
   R_03600C_CPC_PERFCOUNTER1_SELECT,
};
static struct si_pc_block_base cik_CPC = {
   .name = "CPC",
   .num_counters = 2,

   .select = cik_CPC_select,
   .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPF = {
   .name = "CPF",
   .num_counters = 2,

   .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPG = {
   .name = "CPG",
   .num_counters = 2,

   .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_DB = {
   .name = "DB",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
   .num_multi = 3, // really only 2, but there's a gap between registers
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_GDS = {
   .name = "GDS",
   .num_counters = 4,

   .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

/* GRBM result registers are not contiguous, hence the explicit list. */
static unsigned cik_GRBM_counters[] = {
   R_034100_GRBM_PERFCOUNTER0_LO,
   R_03410C_GRBM_PERFCOUNTER1_LO,
};
static struct si_pc_block_base cik_GRBM = {
   .name = "GRBM",
   .num_counters = 2,

   .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
   .counters = cik_GRBM_counters,
};

static struct si_pc_block_base cik_GRBMSE = {
   .name = "GRBMSE",
   .num_counters = 4,

   .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
   .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
};

static struct si_pc_block_base cik_IA = {
   .name = "IA",
   .num_counters = 4,

   .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_PA_SC = {
   .name = "PA_SC",
   .num_counters = 8,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

/* According to docs, PA_SU counters are only 48 bits wide. */
static struct si_pc_block_base cik_PA_SU = {
   .name = "PA_SU",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_SPI = {
   .name = "SPI",
   .num_counters = 6,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_BLOCK,
};

static struct si_pc_block_base cik_SQ = {
   .name = "SQ",
   .num_counters = 16,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,

   .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
   .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
   .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
};

static struct si_pc_block_base cik_SX = {
   .name = "SX",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_TA = {
   .name = "TA",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TD = {
   .name = "TD",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCA = {
   .name = "TCA",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCC = {
   .name = "TCC",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCP = {
   .name = "TCP",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_VGT = {
   .name = "VGT",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_WD = {
   .name = "WD",
   .num_counters = 4,

   .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
};

/* MC and SRBM are exposed for GPUPerfStudio compatibility but have no
 * accessible registers here, hence SI_PC_FAKE (reads emit zeros). */
static struct si_pc_block_base cik_MC = {
   .name = "MC",
   .num_counters = 4,

   .layout = SI_PC_FAKE,
};

static struct si_pc_block_base cik_SRBM = {
   .name = "SRBM",
   .num_counters = 2,

   .layout = SI_PC_FAKE,
};
425
/* Register descriptions for counter blocks introduced or changed on GFX10. */

static struct si_pc_block_base gfx10_CHA = {
   .name = "CHA",
   .num_counters = 4,

   .select0 = R_037780_CHA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035800_CHA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_CHCG = {
   .name = "CHCG",
   .num_counters = 4,

   .select0 = R_036F18_CHCG_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034F20_CHCG_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_CHC = {
   .name = "CHC",
   .num_counters = 4,

   .select0 = R_036F00_CHC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034F00_CHC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GCR = {
   .name = "GCR",
   .num_counters = 2,

   .select0 = R_037580_GCR_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035480_GCR_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GE = {
   .name = "GE",
   .num_counters = 12,

   .select0 = R_036200_GE_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034200_GE_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL1A = {
   .name = "GL1A",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_037700_GL1A_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035700_GL1A_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL1C = {
   .name = "GL1C",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036E80_GL1C_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E80_GL1C_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL2A = {
   .name = "GL2A",
   .num_counters = 4,

   .select0 = R_036E40_GL2A_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E40_GL2A_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL2C = {
   .name = "GL2C",
   .num_counters = 4,

   .select0 = R_036E00_GL2C_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E00_GL2C_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

/* PA_PH selectors are scattered in the register file; counters 0-3 have
 * secondary selectors, 4-7 do not. */
static unsigned gfx10_PA_PH_select[] = {
   R_037600_PA_PH_PERFCOUNTER0_SELECT,
   R_037604_PA_PH_PERFCOUNTER0_SELECT1,
   R_037608_PA_PH_PERFCOUNTER1_SELECT,
   R_037640_PA_PH_PERFCOUNTER1_SELECT1,
   R_03760C_PA_PH_PERFCOUNTER2_SELECT,
   R_037644_PA_PH_PERFCOUNTER2_SELECT1,
   R_037610_PA_PH_PERFCOUNTER3_SELECT,
   R_037648_PA_PH_PERFCOUNTER3_SELECT1,
   R_037614_PA_PH_PERFCOUNTER4_SELECT,
   R_037618_PA_PH_PERFCOUNTER5_SELECT,
   R_03761C_PA_PH_PERFCOUNTER6_SELECT,
   R_037620_PA_PH_PERFCOUNTER7_SELECT,
};
static struct si_pc_block_base gfx10_PA_PH = {
   .name = "PA_PH",
   .num_counters = 8,
   .flags = SI_PC_BLOCK_SE,

   .select = gfx10_PA_PH_select,
   .counter0_lo = R_035600_PA_PH_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_CUSTOM,
};

static struct si_pc_block_base gfx10_PA_SU = {
   .name = "PA_SU",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_RLC = {
   .name = "RLC",
   .num_counters = 2,

   .select0 = R_037304_RLC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035200_RLC_PERFCOUNTER0_LO,
   .num_multi = 0,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_RMI = {
   .name = "RMI",
   /* Actually 4, but the 2nd counter is missing the secondary selector while
    * the 3rd counter has it, which complicates the register layout. */
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_037400_RMI_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035300_RMI_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_UTCL1 = {
   .name = "UTCL1",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_03758C_UTCL1_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035470_UTCL1_PERFCOUNTER0_LO,
   .num_multi = 0,
   .layout = SI_PC_MULTI_ALTERNATE,
};
587
/* Both the number of instances and selectors varies between chips of the same
 * class. We only differentiate by class here and simply expose the maximum
 * number over all chips in a class.
 *
 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
 * blindly once it believes it has identified the hardware, so the order of
 * blocks here matters.
 */
static struct si_pc_block_gfxdescr groups_CIK[] = {
   {&cik_CB, 226}, {&cik_CPF, 17}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15},
   {&cik_PA_SU, 153}, {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252}, {&cik_SX, 32},
   {&cik_TA, 111}, {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55}, {&cik_TCP, 154},
   {&cik_GDS, 121}, {&cik_VGT, 140}, {&cik_IA, 22}, {&cik_MC, 22}, {&cik_SRBM, 19},
   {&cik_WD, 22}, {&cik_CPG, 46}, {&cik_CPC, 22},
};

static struct si_pc_block_gfxdescr groups_VI[] = {
   {&cik_CB, 405}, {&cik_CPF, 19}, {&cik_DB, 257}, {&cik_GRBM, 34}, {&cik_GRBMSE, 15},
   {&cik_PA_SU, 154}, {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273}, {&cik_SX, 34},
   {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55}, {&cik_TCP, 180},
   {&cik_GDS, 121}, {&cik_VGT, 147}, {&cik_IA, 24}, {&cik_MC, 22}, {&cik_SRBM, 27},
   {&cik_WD, 37}, {&cik_CPG, 48}, {&cik_CPC, 24},
};

static struct si_pc_block_gfxdescr groups_gfx9[] = {
   {&cik_CB, 438}, {&cik_CPF, 32}, {&cik_DB, 328}, {&cik_GRBM, 38}, {&cik_GRBMSE, 16},
   {&cik_PA_SU, 292}, {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374}, {&cik_SX, 208},
   {&cik_TA, 119}, {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57}, {&cik_TCP, 85},
   {&cik_GDS, 121}, {&cik_VGT, 148}, {&cik_IA, 32}, {&cik_WD, 58}, {&cik_CPG, 59},
   {&cik_CPC, 35},
};

static struct si_pc_block_gfxdescr groups_gfx10[] = {
   {&cik_CB, 461},
   {&gfx10_CHA, 45},
   {&gfx10_CHCG, 35},
   {&gfx10_CHC, 35},
   {&cik_CPC, 47},
   {&cik_CPF, 40},
   {&cik_CPG, 82},
   {&cik_DB, 370},
   {&gfx10_GCR, 94},
   {&cik_GDS, 123},
   {&gfx10_GE, 315},
   {&gfx10_GL1A, 36},
   {&gfx10_GL1C, 64},
   {&gfx10_GL2A, 91},
   {&gfx10_GL2C, 235},
   {&cik_GRBM, 47},
   {&cik_GRBMSE, 19},
   {&gfx10_PA_PH, 960},
   {&cik_PA_SC, 552},
   {&gfx10_PA_SU, 266},
   {&gfx10_RLC, 7},
   {&gfx10_RMI, 258},
   {&cik_SPI, 329},
   {&cik_SQ, 509},
   {&cik_SX, 225},
   {&cik_TA, 226},
   {&cik_TCP, 77},
   {&cik_TD, 61},
   {&gfx10_UTCL1, 15},
};
653
si_pc_block_has_per_se_groups(const struct si_perfcounters * pc,const struct si_pc_block * block)654 static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
655 const struct si_pc_block *block)
656 {
657 return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
658 (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
659 }
660
si_pc_block_has_per_instance_groups(const struct si_perfcounters * pc,const struct si_pc_block * block)661 static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
662 const struct si_pc_block *block)
663 {
664 return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
665 (block->num_instances > 1 && pc->separate_instance);
666 }
667
lookup_counter(struct si_perfcounters * pc,unsigned index,unsigned * base_gid,unsigned * sub_index)668 static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index,
669 unsigned *base_gid, unsigned *sub_index)
670 {
671 struct si_pc_block *block = pc->blocks;
672 unsigned bid;
673
674 *base_gid = 0;
675 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
676 unsigned total = block->num_groups * block->b->selectors;
677
678 if (index < total) {
679 *sub_index = index;
680 return block;
681 }
682
683 index -= total;
684 *base_gid += block->num_groups;
685 }
686
687 return NULL;
688 }
689
lookup_group(struct si_perfcounters * pc,unsigned * index)690 static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index)
691 {
692 unsigned bid;
693 struct si_pc_block *block = pc->blocks;
694
695 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
696 if (*index < block->num_groups)
697 return block;
698 *index -= block->num_groups;
699 }
700
701 return NULL;
702 }
703
si_pc_emit_instance(struct si_context * sctx,int se,int instance)704 static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
705 {
706 struct radeon_cmdbuf *cs = sctx->gfx_cs;
707 unsigned value = S_030800_SH_BROADCAST_WRITES(1);
708
709 if (se >= 0) {
710 value |= S_030800_SE_INDEX(se);
711 } else {
712 value |= S_030800_SE_BROADCAST_WRITES(1);
713 }
714
715 if (sctx->chip_class >= GFX10) {
716 /* TODO: Expose counters from each shader array separately if needed. */
717 value |= S_030800_SA_BROADCAST_WRITES(1);
718 }
719
720 if (instance >= 0) {
721 value |= S_030800_INSTANCE_INDEX(instance);
722 } else {
723 value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
724 }
725
726 radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
727 }
728
/* Program SQ_PERFCOUNTER_CTRL (and the following control register) with the
 * shader-stage mask that windows shader-block counters. */
static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(cs, shaders & 0x7f); /* only the 7 stage-enable bits are valid */
   radeon_emit(cs, 0xffffffff);
}
737
/* Emit the SELECT/SELECT1 register writes that configure `count` counters of
 * `block` with the given event selectors. The register emission order is
 * dictated by the block's si_pc_reg_layout; secondary (SELECT1) registers
 * are always written as 0. */
static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct si_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned idx;
   unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
   unsigned dw;

   assert(count <= regs->num_counters);

   /* Fake blocks have no selector registers at all. */
   if (regs->layout & SI_PC_FAKE)
      return;

   if (layout_multi == SI_PC_MULTI_BLOCK) {
      assert(!(regs->layout & SI_PC_REG_REVERSE));

      /* Primary selectors, then (if all multi-counters are used) the whole
       * SELECT1 block falls inside one contiguous write. */
      dw = count + regs->num_prelude;
      if (count >= regs->num_multi)
         dw += regs->num_multi;
      radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
      for (idx = 0; idx < regs->num_prelude; ++idx)
         radeon_emit(cs, 0);
      for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
         radeon_emit(cs, selectors[idx] | regs->select_or);

      if (count < regs->num_multi) {
         /* Not all multi-counters used: start a second write at SELECT1. */
         unsigned select1 = regs->select0 + 4 * regs->num_multi;
         radeon_set_uconfig_reg_seq(cs, select1, count);
      }

      for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
         radeon_emit(cs, 0);

      if (count > regs->num_multi) {
         /* Remaining single-selector counters follow the SELECT1 block. */
         for (idx = regs->num_multi; idx < count; ++idx)
            radeon_emit(cs, selectors[idx] | regs->select_or);
      }
   } else if (layout_multi == SI_PC_MULTI_TAIL) {
      unsigned select1, select1_count;

      assert(!(regs->layout & SI_PC_REG_REVERSE));

      /* All primaries in one write, then all SELECT1 registers after the
       * full primary register block. */
      radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
      for (idx = 0; idx < regs->num_prelude; ++idx)
         radeon_emit(cs, 0);
      for (idx = 0; idx < count; ++idx)
         radeon_emit(cs, selectors[idx] | regs->select_or);

      select1 = regs->select0 + 4 * regs->num_counters;
      select1_count = MIN2(count, regs->num_multi);
      radeon_set_uconfig_reg_seq(cs, select1, select1_count);
      for (idx = 0; idx < select1_count; ++idx)
         radeon_emit(cs, 0);
   } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
      /* Explicit register list: pairs of (SELECT, SELECT1) for the first
       * num_multi counters, then bare SELECT registers. */
      unsigned *reg = regs->select;
      for (idx = 0; idx < count; ++idx) {
         radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
         if (idx < regs->num_multi)
            radeon_set_uconfig_reg(cs, *reg++, 0);
      }
   } else {
      assert(layout_multi == SI_PC_MULTI_ALTERNATE);

      /* SELECT1 immediately follows each SELECT that has one. */
      unsigned reg_base = regs->select0;
      unsigned reg_count = count + MIN2(count, regs->num_multi);
      reg_count += regs->num_prelude;

      if (!(regs->layout & SI_PC_REG_REVERSE)) {
         radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

         for (idx = 0; idx < regs->num_prelude; ++idx)
            radeon_emit(cs, 0);
         for (idx = 0; idx < count; ++idx) {
            radeon_emit(cs, selectors[idx] | regs->select_or);
            if (idx < regs->num_multi)
               radeon_emit(cs, 0);
         }
      } else {
         /* Registers run downward: select0 is the HIGHEST address, so the
          * write starts reg_count-1 dwords below it and values are emitted
          * in reverse counter order. */
         reg_base -= (reg_count - 1) * 4;
         radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

         for (idx = count; idx > 0; --idx) {
            if (idx <= regs->num_multi)
               radeon_emit(cs, 0);
            radeon_emit(cs, selectors[idx - 1] | regs->select_or);
         }
         for (idx = 0; idx < regs->num_prelude; ++idx)
            radeon_emit(cs, 0);
      }
   }
}
830
/* Start counting: clear the fence dword at `va`, reset the perfmon state
 * machine, emit PERFCOUNTER_START and switch CP_PERFMON_CNTL to counting. */
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   /* Write 1 to the fence location; si_pc_emit_stop later overwrites it
    * with 0 and waits on it. */
   si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
}
845
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
/* Stop counting: fence on bottom-of-pipe so all work has finished, then
 * sample and stop the counters via EVENT_WRITE and CP_PERFMON_CNTL. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   /* Release a 0 to the fence dword and wait until it lands, guaranteeing
    * all prior work contributed to the counters. */
   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(
      cs, R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));
}
864
/* Copy `count` 64-bit counter results of `block` into memory at `va` using
 * COPY_DATA packets. Fake blocks (SI_PC_FAKE) get zero-filled slots so the
 * result layout stays uniform. */
static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct si_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   /* 8 bytes between consecutive COUNTERn_LO registers (LO + HI pair).
    * Negating an unsigned is well-defined modular arithmetic, so adding
    * the negated delta steps downward for SI_PC_REG_REVERSE blocks. */
   unsigned reg_delta = 8;

   if (!(regs->layout & SI_PC_FAKE)) {
      if (regs->layout & SI_PC_REG_REVERSE)
         reg_delta = -reg_delta;

      for (idx = 0; idx < count; ++idx) {
         /* An explicit register list overrides the regular stride. */
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(cs, reg >> 2); /* COPY_DATA takes dword addresses */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake block: write immediate zeros to keep the buffer layout. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL);
         radeon_emit(cs, 0); /* immediate */
         radeon_emit(cs, 0);
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         va += sizeof(uint64_t);
      }
   }
}
905
si_pc_query_destroy(struct si_context * sctx,struct si_query * squery)906 static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
907 {
908 struct si_query_pc *query = (struct si_query_pc *)squery;
909
910 while (query->groups) {
911 struct si_query_group *group = query->groups;
912 query->groups = group->next;
913 FREE(group);
914 }
915
916 FREE(query->counters);
917
918 si_query_buffer_destroy(sctx->screen, &query->buffer);
919 FREE(query);
920 }
921
/* Keep clocks running while counters are active so results are meaningful;
 * the RLC_PERFMON_CLK_CNTL register moved between GFX8 and GFX10. GFX7 and
 * older have no such control, so nothing is emitted there. */
static void si_inhibit_clockgating(struct si_context *sctx, bool inhibit)
{
   if (sctx->chip_class >= GFX10) {
      radeon_set_uconfig_reg(sctx->gfx_cs, R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->chip_class >= GFX8) {
      radeon_set_uconfig_reg(sctx->gfx_cs, R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
}
932
/* (Re)start a perf-counter query: allocate result space, program the shader
 * window and all group selectors, then kick off counting. */
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx, 0);

   if (query->shaders)
      si_pc_emit_shaders(sctx, query->shaders);

   si_inhibit_clockgating(sctx, true);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct si_pc_block *block = group->block;

      /* Only reprogram GRBM_GFX_INDEX when the SE/instance target changes;
       * groups are expected to be ordered so this coalesces writes. */
      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   /* Restore broadcast mode if any group narrowed the GRBM index. */
   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}
969
/* Stop counting and read back all configured counters into the result
 * buffer, expanding broadcast (negative) SE/instance groups into one read
 * per SE/instance. */
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct si_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      /* A broadcast group on a per-SE block is read once per SE. */
      if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         /* Likewise, a broadcast instance is read once per instance. */
         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, false);
}
1005
si_pc_query_begin(struct si_context * ctx,struct si_query * squery)1006 static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
1007 {
1008 struct si_query_pc *query = (struct si_query_pc *)squery;
1009
1010 si_query_buffer_reset(ctx, &query->buffer);
1011
1012 list_addtail(&query->b.active_list, &ctx->active_queries);
1013 ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1014
1015 si_pc_query_resume(ctx, squery);
1016
1017 return true;
1018 }
1019
si_pc_query_end(struct si_context * ctx,struct si_query * squery)1020 static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
1021 {
1022 struct si_query_pc *query = (struct si_query_pc *)squery;
1023
1024 si_pc_query_suspend(ctx, squery);
1025
1026 list_del(&squery->active_list);
1027 ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;
1028
1029 return query->buffer.buf != NULL;
1030 }
1031
si_pc_query_add_result(struct si_query_pc * query,void * buffer,union pipe_query_result * result)1032 static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
1033 union pipe_query_result *result)
1034 {
1035 uint64_t *results = buffer;
1036 unsigned i, j;
1037
1038 for (i = 0; i < query->num_counters; ++i) {
1039 struct si_query_counter *counter = &query->counters[i];
1040
1041 for (j = 0; j < counter->qwords; ++j) {
1042 uint32_t value = results[counter->base + j * counter->stride];
1043 result->batch[i].u64 += value;
1044 }
1045 }
1046 }
1047
/* Collect the final result of a batched perfcounter query.
 *
 * Maps every buffer in the query's buffer chain and sums all stored
 * start/stop snapshots into `result` (one u64 per counter). Returns false
 * if a buffer could not be mapped (e.g. non-blocking map would stall). */
static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   /* Zero one accumulator slot per selected counter. */
   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   /* Walk the chain of result buffers (a new buffer is linked in front
    * whenever the previous one fills up). */
   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      /* If the work was already flushed, a plain map suffices; otherwise
       * synchronize with in-flight rings first. */
      if (squery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      /* Accumulate every snapshot stored in this buffer, each occupying
       * result_size bytes. */
      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}
1076
/* Query-ops vtable for batched performance-counter queries. */
static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   /* Called around context flushes while the query is active. */
   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};
1086
/* Find or create the si_query_group for (block, sub_gid) in `query`.
 *
 * The sub-group id encodes (from most to least significant): shader type
 * (shader blocks only), SE index (per-SE groups only), and instance index
 * (per-instance groups only). Returns NULL on allocation failure or when
 * the requested shader group is incompatible with counters already added. */
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct si_pc_block *block, unsigned sub_gid)
{
   struct si_query_group *group = query->groups;

   /* Reuse an existing group for the same block/sub-group. */
   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
      /* Strip the shader-type component off sub_gid. */
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = si_pc_shader_type_bits[shader_id];

      /* All shader-block counters in one batch must use the same shader
       * mask; mixing different masks is rejected. */
      query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      // A non-zero value in query->shaders ensures that the shader
      // masking is reset unless the user explicitly requests one.
      query->shaders = SI_PC_SHADERS_WINDOWING;
   }

   /* Decode the SE component (or -1 for "summed over all SEs"). */
   if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   /* Decode the instance component (or -1 for "summed over instances"). */
   if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   /* Prepend to the query's group list. */
   group->next = query->groups;
   query->groups = group;

   return group;
}
1151
si_create_batch_query(struct pipe_context * ctx,unsigned num_queries,unsigned * query_types)1152 struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
1153 unsigned *query_types)
1154 {
1155 struct si_screen *screen = (struct si_screen *)ctx->screen;
1156 struct si_perfcounters *pc = screen->perfcounters;
1157 struct si_pc_block *block;
1158 struct si_query_group *group;
1159 struct si_query_pc *query;
1160 unsigned base_gid, sub_gid, sub_index;
1161 unsigned i, j;
1162
1163 if (!pc)
1164 return NULL;
1165
1166 query = CALLOC_STRUCT(si_query_pc);
1167 if (!query)
1168 return NULL;
1169
1170 query->b.ops = &batch_query_ops;
1171
1172 query->num_counters = num_queries;
1173
1174 /* Collect selectors per group */
1175 for (i = 0; i < num_queries; ++i) {
1176 unsigned sub_gid;
1177
1178 if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
1179 goto error;
1180
1181 block =
1182 lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
1183 if (!block)
1184 goto error;
1185
1186 sub_gid = sub_index / block->b->selectors;
1187 sub_index = sub_index % block->b->selectors;
1188
1189 group = get_group_state(screen, query, block, sub_gid);
1190 if (!group)
1191 goto error;
1192
1193 if (group->num_counters >= block->b->b->num_counters) {
1194 fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
1195 goto error;
1196 }
1197 group->selectors[group->num_counters] = sub_index;
1198 ++group->num_counters;
1199 }
1200
1201 /* Compute result bases and CS size per group */
1202 query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
1203 query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;
1204
1205 i = 0;
1206 for (group = query->groups; group; group = group->next) {
1207 struct si_pc_block *block = group->block;
1208 unsigned read_dw;
1209 unsigned instances = 1;
1210
1211 if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
1212 instances = screen->info.max_se;
1213 if (group->instance < 0)
1214 instances *= block->num_instances;
1215
1216 group->result_base = i;
1217 query->result_size += sizeof(uint64_t) * instances * group->num_counters;
1218 i += instances * group->num_counters;
1219
1220 read_dw = 6 * group->num_counters;
1221 query->b.num_cs_dw_suspend += instances * read_dw;
1222 query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
1223 }
1224
1225 if (query->shaders) {
1226 if (query->shaders == SI_PC_SHADERS_WINDOWING)
1227 query->shaders = 0xffffffff;
1228 }
1229
1230 /* Map user-supplied query array to result indices */
1231 query->counters = CALLOC(num_queries, sizeof(*query->counters));
1232 for (i = 0; i < num_queries; ++i) {
1233 struct si_query_counter *counter = &query->counters[i];
1234 struct si_pc_block *block;
1235
1236 block =
1237 lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
1238
1239 sub_gid = sub_index / block->b->selectors;
1240 sub_index = sub_index % block->b->selectors;
1241
1242 group = get_group_state(screen, query, block, sub_gid);
1243 assert(group != NULL);
1244
1245 for (j = 0; j < group->num_counters; ++j) {
1246 if (group->selectors[j] == sub_index)
1247 break;
1248 }
1249
1250 counter->base = group->result_base + j;
1251 counter->stride = group->num_counters;
1252
1253 counter->qwords = 1;
1254 if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
1255 counter->qwords = screen->info.max_se;
1256 if (group->instance < 0)
1257 counter->qwords *= block->num_instances;
1258 }
1259
1260 return (struct pipe_query *)query;
1261
1262 error:
1263 si_pc_query_destroy((struct si_context *)ctx, &query->b);
1264 return NULL;
1265 }
1266
/* Lazily build the group and selector name tables for a block.
 *
 * Group names have the form NAME[_shader][SE[_]][INSTANCE] and are stored at
 * a fixed stride; selector names append "_%03d". Returns false on allocation
 * failure. */
static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block)
{
   bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
   bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
   unsigned i, j, k;
   unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
   unsigned namelen;
   char *groupname;
   char *p;

   if (per_instance_groups)
      groups_instance = block->num_instances;
   if (per_se_groups)
      groups_se = screen->info.max_se;
   if (block->b->b->flags & SI_PC_BLOCK_SHADER)
      groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);

   /* Compute the fixed stride for one group name, including NUL:
    * block name + optional shader suffix (<= 3 chars) + up to 1 digit for
    * the SE + optional '_' + up to 2 digits for the instance. */
   namelen = strlen(block->b->b->name);
   block->group_name_stride = namelen + 1;
   if (block->b->b->flags & SI_PC_BLOCK_SHADER)
      block->group_name_stride += 3;
   if (per_se_groups) {
      assert(groups_se <= 10);
      block->group_name_stride += 1;

      if (per_instance_groups)
         block->group_name_stride += 1;
   }
   if (per_instance_groups) {
      assert(groups_instance <= 100);
      block->group_name_stride += 2;
   }

   block->group_names = MALLOC(block->num_groups * block->group_name_stride);
   if (!block->group_names)
      return false;

   /* Fill the group names in shader-major, SE, instance order — this must
    * match the sub_gid decoding in get_group_state(). */
   groupname = block->group_names;
   for (i = 0; i < groups_shader; ++i) {
      const char *shader_suffix = si_pc_shader_type_suffixes[i];
      unsigned shaderlen = strlen(shader_suffix);
      for (j = 0; j < groups_se; ++j) {
         for (k = 0; k < groups_instance; ++k) {
            strcpy(groupname, block->b->b->name);
            p = groupname + namelen;

            if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
               strcpy(p, shader_suffix);
               p += shaderlen;
            }

            if (per_se_groups) {
               p += sprintf(p, "%d", j);
               if (per_instance_groups)
                  *p++ = '_';
            }

            if (per_instance_groups)
               p += sprintf(p, "%d", k);

            groupname += block->group_name_stride;
         }
      }
   }

   /* Selector names are "<group>_NNN": 1 for '_' and 3 digits. */
   assert(block->b->selectors <= 1000);
   block->selector_name_stride = block->group_name_stride + 4;
   block->selector_names =
      MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride);
   if (!block->selector_names)
      return false;

   groupname = block->group_names;
   p = block->selector_names;
   for (i = 0; i < block->num_groups; ++i) {
      for (j = 0; j < block->b->selectors; ++j) {
         sprintf(p, "%s_%03d", groupname, j);
         p += block->selector_name_stride;
      }
      groupname += block->group_name_stride;
   }

   return true;
}
1351
si_get_perfcounter_info(struct si_screen * screen,unsigned index,struct pipe_driver_query_info * info)1352 int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
1353 struct pipe_driver_query_info *info)
1354 {
1355 struct si_perfcounters *pc = screen->perfcounters;
1356 struct si_pc_block *block;
1357 unsigned base_gid, sub;
1358
1359 if (!pc)
1360 return 0;
1361
1362 if (!info) {
1363 unsigned bid, num_queries = 0;
1364
1365 for (bid = 0; bid < pc->num_blocks; ++bid) {
1366 num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups;
1367 }
1368
1369 return num_queries;
1370 }
1371
1372 block = lookup_counter(pc, index, &base_gid, &sub);
1373 if (!block)
1374 return 0;
1375
1376 if (!block->selector_names) {
1377 if (!si_init_block_names(screen, block))
1378 return 0;
1379 }
1380 info->name = block->selector_names + sub * block->selector_name_stride;
1381 info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
1382 info->max_value.u64 = 0;
1383 info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
1384 info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
1385 info->group_id = base_gid + sub / block->b->selectors;
1386 info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
1387 if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
1388 info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
1389 return 1;
1390 }
1391
si_get_perfcounter_group_info(struct si_screen * screen,unsigned index,struct pipe_driver_query_group_info * info)1392 int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
1393 struct pipe_driver_query_group_info *info)
1394 {
1395 struct si_perfcounters *pc = screen->perfcounters;
1396 struct si_pc_block *block;
1397
1398 if (!pc)
1399 return 0;
1400
1401 if (!info)
1402 return pc->num_groups;
1403
1404 block = lookup_group(pc, &index);
1405 if (!block)
1406 return 0;
1407
1408 if (!block->group_names) {
1409 if (!si_init_block_names(screen, block))
1410 return 0;
1411 }
1412 info->name = block->group_names + index * block->group_name_stride;
1413 info->num_queries = block->b->selectors;
1414 info->max_active_queries = block->b->b->num_counters;
1415 return 1;
1416 }
1417
si_destroy_perfcounters(struct si_screen * screen)1418 void si_destroy_perfcounters(struct si_screen *screen)
1419 {
1420 struct si_perfcounters *pc = screen->perfcounters;
1421 unsigned i;
1422
1423 if (!pc)
1424 return;
1425
1426 for (i = 0; i < pc->num_blocks; ++i) {
1427 FREE(pc->blocks[i].group_names);
1428 FREE(pc->blocks[i].selector_names);
1429 }
1430 FREE(pc->blocks);
1431 FREE(pc);
1432 screen->perfcounters = NULL;
1433 }
1434
/* Initialize performance-counter support for the screen.
 *
 * Selects the per-generation block descriptor table, allocates the runtime
 * block array, and computes per-block instance and group counts. Silently
 * does nothing on unsupported chips; on allocation failure all partial
 * state is torn down. */
void si_init_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc;
   const struct si_pc_block_gfxdescr *blocks;
   unsigned num_blocks;
   unsigned i;

   /* Pick the static block descriptor table for this GPU generation. */
   switch (screen->info.chip_class) {
   case GFX7:
      blocks = groups_CIK;
      num_blocks = ARRAY_SIZE(groups_CIK);
      break;
   case GFX8:
      blocks = groups_VI;
      num_blocks = ARRAY_SIZE(groups_VI);
      break;
   case GFX9:
      blocks = groups_gfx9;
      num_blocks = ARRAY_SIZE(groups_gfx9);
      break;
   case GFX10:
   case GFX10_3:
      blocks = groups_gfx10;
      num_blocks = ARRAY_SIZE(groups_gfx10);
      break;
   case GFX6:
   default:
      return; /* not implemented */
   }

   screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
   if (!pc)
      return;

   /* Fixed CS dword budgets for the stop and instance-select packets. */
   pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   pc->num_instance_cs_dwords = 3;

   /* Debug options: expose per-SE / per-instance groups instead of sums. */
   pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
   if (!pc->blocks)
      goto error;
   pc->num_blocks = num_blocks;

   for (i = 0; i < num_blocks; ++i) {
      struct si_pc_block *block = &pc->blocks[i];
      block->b = &blocks[i];
      block->num_instances = MAX2(1, block->b->instances);

      /* Some blocks' instance counts depend on the chip configuration
       * rather than the static descriptor table. */
      if (!strcmp(block->b->b->name, "CB") ||
          !strcmp(block->b->b->name, "DB") ||
          !strcmp(block->b->b->name, "RMI"))
         block->num_instances = screen->info.max_se;
      else if (!strcmp(block->b->b->name, "TCC"))
         block->num_instances = screen->info.num_tcc_blocks;
      else if (!strcmp(block->b->b->name, "IA"))
         block->num_instances = MAX2(1, screen->info.max_se / 2);
      else if (!strcmp(block->b->b->name, "TA") ||
               !strcmp(block->b->b->name, "TCP") ||
               !strcmp(block->b->b->name, "TD")) {
         block->num_instances = MAX2(1, screen->info.max_good_cu_per_sa);
      }

      /* One group per exposed instance/SE/shader-type combination. */
      if (si_pc_block_has_per_instance_groups(pc, block)) {
         block->num_groups = block->num_instances;
      } else {
         block->num_groups = 1;
      }

      if (si_pc_block_has_per_se_groups(pc, block))
         block->num_groups *= screen->info.max_se;
      if (block->b->b->flags & SI_PC_BLOCK_SHADER)
         block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);

      pc->num_groups += block->num_groups;
   }

   return;

error:
   si_destroy_perfcounters(screen);
}
1518