/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "si_build_pm4.h"
#include "si_query.h"
#include "util/u_memory.h"

enum si_pc_block_flags
{
   /* This block is part of the shader engine */
   SI_PC_BLOCK_SE = (1 << 0),

   /* Expose per-instance groups instead of summing all instances (within
    * an SE). */
   SI_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),

   /* Expose per-SE groups instead of summing instances across SEs. */
   SI_PC_BLOCK_SE_GROUPS = (1 << 2),

   /* Shader block */
   SI_PC_BLOCK_SHADER = (1 << 3),

   /* Non-shader block with perfcounters windowed by shaders. */
   SI_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
};

enum si_pc_reg_layout
{
   /* All secondary selector dwords follow as one block after the primary
    * selector dwords for the counters that have secondary selectors.
    *
    * Example:
    *    PERFCOUNTER0_SELECT
    *    PERFCOUNTER1_SELECT
    *    PERFCOUNTER0_SELECT1
    *    PERFCOUNTER1_SELECT1
    *    PERFCOUNTER2_SELECT
    *    PERFCOUNTER3_SELECT
    */
   SI_PC_MULTI_BLOCK = 0,

   /* Each secondary selector dword follows immediately after the
    * corresponding primary.
    *
    * Example:
    *    PERFCOUNTER0_SELECT
    *    PERFCOUNTER0_SELECT1
    *    PERFCOUNTER1_SELECT
    *    PERFCOUNTER1_SELECT1
    *    PERFCOUNTER2_SELECT
    *    PERFCOUNTER3_SELECT
    */
   SI_PC_MULTI_ALTERNATE = 1,

   /* All secondary selector dwords follow as one block after all primary
    * selector dwords.
    *
    * Example:
    *    PERFCOUNTER0_SELECT
    *    PERFCOUNTER1_SELECT
    *    PERFCOUNTER2_SELECT
    *    PERFCOUNTER3_SELECT
    *    PERFCOUNTER0_SELECT1
    *    PERFCOUNTER1_SELECT1
    */
   SI_PC_MULTI_TAIL = 2,

   /* Free-form arrangement of selector registers. */
   SI_PC_MULTI_CUSTOM = 3,

   SI_PC_MULTI_MASK = 3,

   /* Registers are laid out in decreasing rather than increasing order. */
   SI_PC_REG_REVERSE = 4,

   SI_PC_FAKE = 8,
};

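/* Illustration (derived from si_pc_emit_select() below, not from the register
 * spec): with SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, a block whose selector
 * registers sit at decreasing addresses, e.g. two counters with one secondary
 * selector, is still programmed as a single ascending register sequence, here
 * starting at select0 - 2 * 4:
 *    PERFCOUNTER1_SELECT   (lowest address)
 *    PERFCOUNTER0_SELECT1
 *    PERFCOUNTER0_SELECT   (select0, highest address)
 */
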
struct si_pc_block_base {
   const char *name;
   unsigned num_counters;
   unsigned flags;

   unsigned select_or;   /* ORed into every selector value that is written */
   unsigned select0;     /* address of the first SELECT register */
   unsigned counter0_lo; /* address of the first COUNTER*_LO register */
   unsigned *select;     /* explicit selector register list (SI_PC_MULTI_CUSTOM) */
   unsigned *counters;   /* explicit counter register list (overrides counter0_lo) */
   unsigned num_multi;   /* number of counters that have a secondary (SELECT1) selector */
   unsigned num_prelude; /* number of zero dwords written before the selectors */
   unsigned layout;      /* combination of enum si_pc_reg_layout flags */
};

/* Per-gfx-generation description of a block: how many event selectors it
 * exposes and how many instances it has (0 means 1; si_init_perfcounters()
 * may override the instance count per chip). */
struct si_pc_block_gfxdescr {
   struct si_pc_block_base *b;
   unsigned selectors;
   unsigned instances;
};

struct si_pc_block {
   const struct si_pc_block_gfxdescr *b;
   unsigned num_instances;

   unsigned num_groups;
   char *group_names;
   unsigned group_name_stride;

   char *selector_names;
   unsigned selector_name_stride;
};
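
/* group_names and selector_names are flat arrays of fixed-stride,
 * NUL-terminated strings, filled lazily by si_init_block_names(): group i
 * lives at group_names + i * group_name_stride, and selector j of group i at
 * selector_names + (i * b->selectors + j) * selector_name_stride.
 */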

/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of
 * performance counter group IDs.
 */
static const char *const si_pc_shader_type_suffixes[] = {"",    "_ES", "_GS", "_VS",
                                                         "_PS", "_LS", "_HS", "_CS"};

static const unsigned si_pc_shader_type_bits[] = {
   0x7f,
   S_036780_ES_EN(1),
   S_036780_GS_EN(1),
   S_036780_VS_EN(1),
   S_036780_PS_EN(1),
   S_036780_LS_EN(1),
   S_036780_HS_EN(1),
   S_036780_CS_EN(1),
};

/* Max counters per HW block */
#define SI_QUERY_MAX_COUNTERS 16

#define SI_PC_SHADERS_WINDOWING (1u << 31)

struct si_query_group {
   struct si_query_group *next;
   struct si_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;
   int instance;
   unsigned num_counters;
   unsigned selectors[SI_QUERY_MAX_COUNTERS];
};

struct si_query_counter {
   unsigned base;   /* offset of the first value in a result snapshot, in uint64s */
   unsigned qwords; /* number of values to sum (one per SE/instance that is read) */
   unsigned stride; /* in uint64s, distance between two values of the same counter */
};

struct si_query_pc {
   struct si_query b;
   struct si_query_buffer buffer;

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;
   unsigned num_counters;
   struct si_query_counter *counters;
   struct si_query_group *groups;
};
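
/* Layout of one result snapshot in the query buffer, as written by
 * si_pc_query_suspend() and accumulated by si_pc_query_add_result(): for each
 * group, and for each SE/instance that is read, the group's counters are
 * stored as consecutive 64-bit values. struct si_query_counter describes where
 * a single user-visible counter lives inside that snapshot
 * (base + i * stride for i < qwords).
 */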

static struct si_pc_block_base cik_CB = {
   .name = "CB",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_037000_CB_PERFCOUNTER_FILTER,
   .counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
   .num_multi = 1,
   .num_prelude = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static unsigned cik_CPC_select[] = {
   R_036024_CPC_PERFCOUNTER0_SELECT,
   R_036010_CPC_PERFCOUNTER0_SELECT1,
   R_03600C_CPC_PERFCOUNTER1_SELECT,
};
static struct si_pc_block_base cik_CPC = {
   .name = "CPC",
   .num_counters = 2,

   .select = cik_CPC_select,
   .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPF = {
   .name = "CPF",
   .num_counters = 2,

   .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_CPG = {
   .name = "CPG",
   .num_counters = 2,

   .select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
};

static struct si_pc_block_base cik_DB = {
   .name = "DB",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_037100_DB_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
   .num_multi = 3, // really only 2, but there's a gap between registers
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_GDS = {
   .name = "GDS",
   .num_counters = 4,

   .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

static unsigned cik_GRBM_counters[] = {
   R_034100_GRBM_PERFCOUNTER0_LO,
   R_03410C_GRBM_PERFCOUNTER1_LO,
};
static struct si_pc_block_base cik_GRBM = {
   .name = "GRBM",
   .num_counters = 2,

   .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
   .counters = cik_GRBM_counters,
};

static struct si_pc_block_base cik_GRBMSE = {
   .name = "GRBMSE",
   .num_counters = 4,

   .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
   .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
};

static struct si_pc_block_base cik_IA = {
   .name = "IA",
   .num_counters = 4,

   .select0 = R_036210_IA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_PA_SC = {
   .name = "PA_SC",
   .num_counters = 8,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

/* According to docs, PA_SU counters are only 48 bits wide. */
static struct si_pc_block_base cik_PA_SU = {
   .name = "PA_SU",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_SPI = {
   .name = "SPI",
   .num_counters = 6,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_BLOCK,
};

static struct si_pc_block_base cik_SQ = {
   .name = "SQ",
   .num_counters = 16,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER,

   .select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
   .select_or = S_036700_SQC_BANK_MASK(15) | S_036700_SQC_CLIENT_MASK(15) | S_036700_SIMD_MASK(15),
   .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
};

static struct si_pc_block_base cik_SX = {
   .name = "SX",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036900_SX_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_TA = {
   .name = "TA",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TD = {
   .name = "TD",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCA = {
   .name = "TCA",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCC = {
   .name = "TCC",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_TCP = {
   .name = "TCP",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base cik_VGT = {
   .name = "VGT",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_TAIL,
};

static struct si_pc_block_base cik_WD = {
   .name = "WD",
   .num_counters = 4,

   .select0 = R_036200_WD_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
};

static struct si_pc_block_base cik_MC = {
   .name = "MC",
   .num_counters = 4,

   .layout = SI_PC_FAKE,
};

static struct si_pc_block_base cik_SRBM = {
   .name = "SRBM",
   .num_counters = 2,

   .layout = SI_PC_FAKE,
};

static struct si_pc_block_base gfx10_CHA = {
   .name = "CHA",
   .num_counters = 4,

   .select0 = R_037780_CHA_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035800_CHA_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_CHCG = {
   .name = "CHCG",
   .num_counters = 4,

   .select0 = R_036F18_CHCG_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034F20_CHCG_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_CHC = {
   .name = "CHC",
   .num_counters = 4,

   .select0 = R_036F00_CHC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034F00_CHC_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GCR = {
   .name = "GCR",
   .num_counters = 2,

   .select0 = R_037580_GCR_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035480_GCR_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GE = {
   .name = "GE",
   .num_counters = 12,

   .select0 = R_036200_GE_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034200_GE_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL1A = {
   .name = "GL1A",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_037700_GL1A_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035700_GL1A_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL1C = {
   .name = "GL1C",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_036E80_GL1C_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E80_GL1C_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL2A = {
   .name = "GL2A",
   .num_counters = 4,

   .select0 = R_036E40_GL2A_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E40_GL2A_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_GL2C = {
   .name = "GL2C",
   .num_counters = 4,

   .select0 = R_036E00_GL2C_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034E00_GL2C_PERFCOUNTER0_LO,
   .num_multi = 2,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static unsigned gfx10_PA_PH_select[] = {
   R_037600_PA_PH_PERFCOUNTER0_SELECT,
   R_037604_PA_PH_PERFCOUNTER0_SELECT1,
   R_037608_PA_PH_PERFCOUNTER1_SELECT,
   R_037640_PA_PH_PERFCOUNTER1_SELECT1,
   R_03760C_PA_PH_PERFCOUNTER2_SELECT,
   R_037644_PA_PH_PERFCOUNTER2_SELECT1,
   R_037610_PA_PH_PERFCOUNTER3_SELECT,
   R_037648_PA_PH_PERFCOUNTER3_SELECT1,
   R_037614_PA_PH_PERFCOUNTER4_SELECT,
   R_037618_PA_PH_PERFCOUNTER5_SELECT,
   R_03761C_PA_PH_PERFCOUNTER6_SELECT,
   R_037620_PA_PH_PERFCOUNTER7_SELECT,
};
static struct si_pc_block_base gfx10_PA_PH = {
   .name = "PA_PH",
   .num_counters = 8,
   .flags = SI_PC_BLOCK_SE,

   .select = gfx10_PA_PH_select,
   .counter0_lo = R_035600_PA_PH_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_CUSTOM,
};

static struct si_pc_block_base gfx10_PA_SU = {
   .name = "PA_SU",
   .num_counters = 4,
   .flags = SI_PC_BLOCK_SE,

   .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
   .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
   .num_multi = 4,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_RLC = {
   .name = "RLC",
   .num_counters = 2,

   .select0 = R_037304_RLC_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035200_RLC_PERFCOUNTER0_LO,
   .num_multi = 0,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_RMI = {
   .name = "RMI",
   /* Actually 4, but the 2nd counter is missing the secondary selector while
    * the 3rd counter has it, which complicates the register layout. */
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS,

   .select0 = R_037400_RMI_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035300_RMI_PERFCOUNTER0_LO,
   .num_multi = 1,
   .layout = SI_PC_MULTI_ALTERNATE,
};

static struct si_pc_block_base gfx10_UTCL1 = {
   .name = "UTCL1",
   .num_counters = 2,
   .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER_WINDOWED,

   .select0 = R_03758C_UTCL1_PERFCOUNTER0_SELECT,
   .counter0_lo = R_035470_UTCL1_PERFCOUNTER0_LO,
   .num_multi = 0,
   .layout = SI_PC_MULTI_ALTERNATE,
};

/* Both the number of instances and the number of selectors vary between chips
 * of the same class. We only differentiate by class here and simply expose the
 * maximum number over all chips in a class.
 *
 * Unfortunately, GPUPerfStudio uses the order of performance counter groups
 * blindly once it believes it has identified the hardware, so the order of
 * blocks here matters.
 */
static struct si_pc_block_gfxdescr groups_CIK[] = {
   {&cik_CB, 226},     {&cik_CPF, 17},    {&cik_DB, 257},  {&cik_GRBM, 34},   {&cik_GRBMSE, 15},
   {&cik_PA_SU, 153},  {&cik_PA_SC, 395}, {&cik_SPI, 186}, {&cik_SQ, 252},    {&cik_SX, 32},
   {&cik_TA, 111},     {&cik_TCA, 39, 2}, {&cik_TCC, 160}, {&cik_TD, 55},     {&cik_TCP, 154},
   {&cik_GDS, 121},    {&cik_VGT, 140},   {&cik_IA, 22},   {&cik_MC, 22},     {&cik_SRBM, 19},
   {&cik_WD, 22},      {&cik_CPG, 46},    {&cik_CPC, 22},
};

static struct si_pc_block_gfxdescr groups_VI[] = {
   {&cik_CB, 405},     {&cik_CPF, 19},    {&cik_DB, 257},  {&cik_GRBM, 34},   {&cik_GRBMSE, 15},
   {&cik_PA_SU, 154},  {&cik_PA_SC, 397}, {&cik_SPI, 197}, {&cik_SQ, 273},    {&cik_SX, 34},
   {&cik_TA, 119},     {&cik_TCA, 35, 2}, {&cik_TCC, 192}, {&cik_TD, 55},     {&cik_TCP, 180},
   {&cik_GDS, 121},    {&cik_VGT, 147},   {&cik_IA, 24},   {&cik_MC, 22},     {&cik_SRBM, 27},
   {&cik_WD, 37},      {&cik_CPG, 48},    {&cik_CPC, 24},
};

static struct si_pc_block_gfxdescr groups_gfx9[] = {
   {&cik_CB, 438},     {&cik_CPF, 32},    {&cik_DB, 328},  {&cik_GRBM, 38},   {&cik_GRBMSE, 16},
   {&cik_PA_SU, 292},  {&cik_PA_SC, 491}, {&cik_SPI, 196}, {&cik_SQ, 374},    {&cik_SX, 208},
   {&cik_TA, 119},     {&cik_TCA, 35, 2}, {&cik_TCC, 256}, {&cik_TD, 57},     {&cik_TCP, 85},
   {&cik_GDS, 121},    {&cik_VGT, 148},   {&cik_IA, 32},   {&cik_WD, 58},     {&cik_CPG, 59},
   {&cik_CPC, 35},
};

static struct si_pc_block_gfxdescr groups_gfx10[] = {
   {&cik_CB, 461},
   {&gfx10_CHA, 45},
   {&gfx10_CHCG, 35},
   {&gfx10_CHC, 35},
   {&cik_CPC, 47},
   {&cik_CPF, 40},
   {&cik_CPG, 82},
   {&cik_DB, 370},
   {&gfx10_GCR, 94},
   {&cik_GDS, 123},
   {&gfx10_GE, 315},
   {&gfx10_GL1A, 36},
   {&gfx10_GL1C, 64},
   {&gfx10_GL2A, 91},
   {&gfx10_GL2C, 235},
   {&cik_GRBM, 47},
   {&cik_GRBMSE, 19},
   {&gfx10_PA_PH, 960},
   {&cik_PA_SC, 552},
   {&gfx10_PA_SU, 266},
   {&gfx10_RLC, 7},
   {&gfx10_RMI, 258},
   {&cik_SPI, 329},
   {&cik_SQ, 509},
   {&cik_SX, 225},
   {&cik_TA, 226},
   {&cik_TCP, 77},
   {&cik_TD, 61},
   {&gfx10_UTCL1, 15},
};

static bool si_pc_block_has_per_se_groups(const struct si_perfcounters *pc,
                                          const struct si_pc_block *block)
{
   return block->b->b->flags & SI_PC_BLOCK_SE_GROUPS ||
          (block->b->b->flags & SI_PC_BLOCK_SE && pc->separate_se);
}

static bool si_pc_block_has_per_instance_groups(const struct si_perfcounters *pc,
                                                const struct si_pc_block *block)
{
   return block->b->b->flags & SI_PC_BLOCK_INSTANCE_GROUPS ||
          (block->num_instances > 1 && pc->separate_instance);
}

static struct si_pc_block *lookup_counter(struct si_perfcounters *pc, unsigned index,
                                          unsigned *base_gid, unsigned *sub_index)
{
   struct si_pc_block *block = pc->blocks;
   unsigned bid;

   *base_gid = 0;
   for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
      unsigned total = block->num_groups * block->b->selectors;

      if (index < total) {
         *sub_index = index;
         return block;
      }

      index -= total;
      *base_gid += block->num_groups;
   }

   return NULL;
}

static struct si_pc_block *lookup_group(struct si_perfcounters *pc, unsigned *index)
{
   unsigned bid;
   struct si_pc_block *block = pc->blocks;

   for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
      if (*index < block->num_groups)
         return block;
      *index -= block->num_groups;
   }

   return NULL;
}

static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (sctx->chip_class >= GFX10) {
      /* TODO: Expose counters from each shader array separately if needed. */
      value |= S_030800_SA_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}

static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(cs, shaders & 0x7f);
   radeon_emit(cs, 0xffffffff);
}

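/* Program the event selectors of one block according to its register layout.
 *
 * Worked example (derived from the code below, for illustration only): for an
 * SI_PC_MULTI_ALTERNATE block with num_multi = 1, num_prelude = 0 and
 * count = 3, the sequence written starting at select0 is
 *    PERFCOUNTER0_SELECT  = selectors[0] | select_or
 *    PERFCOUNTER0_SELECT1 = 0
 *    PERFCOUNTER1_SELECT  = selectors[1] | select_or
 *    PERFCOUNTER2_SELECT  = selectors[2] | select_or
 */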
static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct si_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned idx;
   unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
   unsigned dw;

   assert(count <= regs->num_counters);

   if (regs->layout & SI_PC_FAKE)
      return;

   if (layout_multi == SI_PC_MULTI_BLOCK) {
      assert(!(regs->layout & SI_PC_REG_REVERSE));

      dw = count + regs->num_prelude;
      if (count >= regs->num_multi)
         dw += regs->num_multi;
      radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
      for (idx = 0; idx < regs->num_prelude; ++idx)
         radeon_emit(cs, 0);
      for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
         radeon_emit(cs, selectors[idx] | regs->select_or);

      if (count < regs->num_multi) {
         unsigned select1 = regs->select0 + 4 * regs->num_multi;
         radeon_set_uconfig_reg_seq(cs, select1, count);
      }

      for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
         radeon_emit(cs, 0);

      if (count > regs->num_multi) {
         for (idx = regs->num_multi; idx < count; ++idx)
            radeon_emit(cs, selectors[idx] | regs->select_or);
      }
   } else if (layout_multi == SI_PC_MULTI_TAIL) {
      unsigned select1, select1_count;

      assert(!(regs->layout & SI_PC_REG_REVERSE));

      radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
      for (idx = 0; idx < regs->num_prelude; ++idx)
         radeon_emit(cs, 0);
      for (idx = 0; idx < count; ++idx)
         radeon_emit(cs, selectors[idx] | regs->select_or);

      select1 = regs->select0 + 4 * regs->num_counters;
      select1_count = MIN2(count, regs->num_multi);
      radeon_set_uconfig_reg_seq(cs, select1, select1_count);
      for (idx = 0; idx < select1_count; ++idx)
         radeon_emit(cs, 0);
   } else if (layout_multi == SI_PC_MULTI_CUSTOM) {
      unsigned *reg = regs->select;
      for (idx = 0; idx < count; ++idx) {
         radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
         if (idx < regs->num_multi)
            radeon_set_uconfig_reg(cs, *reg++, 0);
      }
   } else {
      assert(layout_multi == SI_PC_MULTI_ALTERNATE);

      unsigned reg_base = regs->select0;
      unsigned reg_count = count + MIN2(count, regs->num_multi);
      reg_count += regs->num_prelude;

      if (!(regs->layout & SI_PC_REG_REVERSE)) {
         radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

         for (idx = 0; idx < regs->num_prelude; ++idx)
            radeon_emit(cs, 0);
         for (idx = 0; idx < count; ++idx) {
            radeon_emit(cs, selectors[idx] | regs->select_or);
            if (idx < regs->num_multi)
               radeon_emit(cs, 0);
         }
      } else {
         reg_base -= (reg_count - 1) * 4;
         radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);

         for (idx = count; idx > 0; --idx) {
            if (idx <= regs->num_multi)
               radeon_emit(cs, 0);
            radeon_emit(cs, selectors[idx - 1] | regs->select_or);
         }
         for (idx = 0; idx < regs->num_prelude; ++idx)
            radeon_emit(cs, 0);
      }
   }
}

static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   si_cp_copy_data(sctx, sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
}

/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * add it again here. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(
      cs, R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));
}

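/* Copy the current value of each selected counter of a block into the results
 * buffer at va. Each counter occupies one 64-bit slot; for SI_PC_FAKE blocks
 * (e.g. MC/SRBM above, which have no readable counter registers here) a zero
 * immediate is written instead so the result layout stays the same.
 */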
static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct si_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   if (!(regs->layout & SI_PC_FAKE)) {
      if (regs->layout & SI_PC_REG_REVERSE)
         reg_delta = -reg_delta;

      for (idx = 0; idx < count; ++idx) {
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(cs, reg >> 2);
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL);
         radeon_emit(cs, 0); /* immediate */
         radeon_emit(cs, 0);
         radeon_emit(cs, va);
         radeon_emit(cs, va >> 32);
         va += sizeof(uint64_t);
      }
   }
}

static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   while (query->groups) {
      struct si_query_group *group = query->groups;
      query->groups = group->next;
      FREE(group);
   }

   FREE(query->counters);

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   FREE(query);
}

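/* Toggle the RLC override that keeps block clocks running while perfcounters
 * are active (otherwise clock-gated blocks may stop counting). Only programmed
 * on GFX8 and newer.
 */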
static void si_inhibit_clockgating(struct si_context *sctx, bool inhibit)
{
   if (sctx->chip_class >= GFX10) {
      radeon_set_uconfig_reg(sctx->gfx_cs, R_037390_RLC_PERFMON_CLK_CNTL,
                            S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->chip_class >= GFX8) {
      radeon_set_uconfig_reg(sctx->gfx_cs, R_0372FC_RLC_PERFMON_CLK_CNTL,
                            S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
}

static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx, 0);

   if (query->shaders)
      si_pc_emit_shaders(sctx, query->shaders);

   si_inhibit_clockgating(sctx, true);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct si_pc_block *block = group->block;

      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}

static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct si_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & SI_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, false);
}

static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_query_buffer_reset(ctx, &query->buffer);

   list_addtail(&query->b.active_list, &ctx->active_queries);
   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;

   si_pc_query_resume(ctx, squery);

   return true;
}

static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_pc_query_suspend(ctx, squery);

   list_del(&squery->active_list);
   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;

   return query->buffer.buf != NULL;
}

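/* Accumulate one result snapshot into the user-visible results. For counter i,
 * the values of all SEs/instances that were read live at
 * results[base + j * stride] for j < qwords and are summed into batch[i].u64.
 * Only the low 32 bits of each 64-bit slot are added (note the uint32_t
 * truncation below).
 */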
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}

static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      if (squery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}

static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};

static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct si_pc_block *block, unsigned sub_gid)
{
   struct si_query_group *group = query->groups;

   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (si_pc_block_has_per_se_groups(screen->perfcounters, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = si_pc_shader_type_bits[shader_id];

      query_shaders = query->shaders & ~SI_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & SI_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      // A non-zero value in query->shaders ensures that the shader
      // masking is reset unless the user explicitly requests one.
      query->shaders = SI_PC_SHADERS_WINDOWING;
   }

   if (si_pc_block_has_per_se_groups(screen->perfcounters, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   if (si_pc_block_has_per_instance_groups(screen->perfcounters, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   group->next = query->groups;
   query->groups = group;

   return group;
}

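/* Creation entry point for perfcounter batch queries, typically installed as
 * the pipe_context::create_batch_query hook. Illustrative use from a frontend
 * (e.g. the HUD), assuming two exposed counters; query_types[] holds values
 * previously reported via si_get_perfcounter_info()->query_type:
 *
 *    struct pipe_query *q = ctx->create_batch_query(ctx, 2, query_types);
 *    ctx->begin_query(ctx, q);
 *    ... rendering ...
 *    ctx->end_query(ctx, q);
 *    union pipe_query_result result;
 *    ctx->get_query_result(ctx, q, true, &result);  // result.batch[0..1].u64
 */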
struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types)
{
   struct si_screen *screen = (struct si_screen *)ctx->screen;
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_pc_block *block;
   struct si_query_group *group;
   struct si_query_pc *query;
   unsigned base_gid, sub_gid, sub_index;
   unsigned i, j;

   if (!pc)
      return NULL;

   query = CALLOC_STRUCT(si_query_pc);
   if (!query)
      return NULL;

   query->b.ops = &batch_query_ops;

   query->num_counters = num_queries;

   /* Collect selectors per group */
   for (i = 0; i < num_queries; ++i) {
      unsigned sub_gid;

      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
         goto error;

      block =
         lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
      if (!block)
         goto error;

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      if (!group)
         goto error;

      if (group->num_counters >= block->b->b->num_counters) {
         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
         goto error;
      }
      group->selectors[group->num_counters] = sub_index;
      ++group->num_counters;
   }

   /* Compute result bases and CS size per group */
   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

   i = 0;
   for (group = query->groups; group; group = group->next) {
      struct si_pc_block *block = group->block;
      unsigned read_dw;
      unsigned instances = 1;

      if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
         instances = screen->info.max_se;
      if (group->instance < 0)
         instances *= block->num_instances;

      group->result_base = i;
      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
      i += instances * group->num_counters;

      read_dw = 6 * group->num_counters;
      query->b.num_cs_dw_suspend += instances * read_dw;
      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
   }

   if (query->shaders) {
      if (query->shaders == SI_PC_SHADERS_WINDOWING)
         query->shaders = 0xffffffff;
   }

   /* Map user-supplied query array to result indices */
   query->counters = CALLOC(num_queries, sizeof(*query->counters));
   if (!query->counters)
      goto error;
   for (i = 0; i < num_queries; ++i) {
      struct si_query_counter *counter = &query->counters[i];
      struct si_pc_block *block;

      block =
         lookup_counter(pc, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      assert(group != NULL);

      for (j = 0; j < group->num_counters; ++j) {
         if (group->selectors[j] == sub_index)
            break;
      }

      counter->base = group->result_base + j;
      counter->stride = group->num_counters;

      counter->qwords = 1;
      if ((block->b->b->flags & SI_PC_BLOCK_SE) && group->se < 0)
         counter->qwords = screen->info.max_se;
      if (group->instance < 0)
         counter->qwords *= block->num_instances;
   }

   return (struct pipe_query *)query;

error:
   si_pc_query_destroy((struct si_context *)ctx, &query->b);
   return NULL;
}

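/* Build the flat group/selector name arrays for a block. For illustration,
 * a block named "TA" with per-instance groups (and neither per-SE nor shader
 * groups) gets group names "TA0", "TA1", ... and selector names "TA0_000",
 * "TA0_001", ..., each entry stored at the fixed stride computed below.
 */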
static bool si_init_block_names(struct si_screen *screen, struct si_pc_block *block)
{
   bool per_instance_groups = si_pc_block_has_per_instance_groups(screen->perfcounters, block);
   bool per_se_groups = si_pc_block_has_per_se_groups(screen->perfcounters, block);
   unsigned i, j, k;
   unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
   unsigned namelen;
   char *groupname;
   char *p;

   if (per_instance_groups)
      groups_instance = block->num_instances;
   if (per_se_groups)
      groups_se = screen->info.max_se;
   if (block->b->b->flags & SI_PC_BLOCK_SHADER)
      groups_shader = ARRAY_SIZE(si_pc_shader_type_bits);

   namelen = strlen(block->b->b->name);
   block->group_name_stride = namelen + 1;
   if (block->b->b->flags & SI_PC_BLOCK_SHADER)
      block->group_name_stride += 3;
   if (per_se_groups) {
      assert(groups_se <= 10);
      block->group_name_stride += 1;

      if (per_instance_groups)
         block->group_name_stride += 1;
   }
   if (per_instance_groups) {
      assert(groups_instance <= 100);
      block->group_name_stride += 2;
   }

   block->group_names = MALLOC(block->num_groups * block->group_name_stride);
   if (!block->group_names)
      return false;

   groupname = block->group_names;
   for (i = 0; i < groups_shader; ++i) {
      const char *shader_suffix = si_pc_shader_type_suffixes[i];
      unsigned shaderlen = strlen(shader_suffix);
      for (j = 0; j < groups_se; ++j) {
         for (k = 0; k < groups_instance; ++k) {
            strcpy(groupname, block->b->b->name);
            p = groupname + namelen;

            if (block->b->b->flags & SI_PC_BLOCK_SHADER) {
               strcpy(p, shader_suffix);
               p += shaderlen;
            }

            if (per_se_groups) {
               p += sprintf(p, "%d", j);
               if (per_instance_groups)
                  *p++ = '_';
            }

            if (per_instance_groups)
               p += sprintf(p, "%d", k);

            groupname += block->group_name_stride;
         }
      }
   }

   assert(block->b->selectors <= 1000);
   block->selector_name_stride = block->group_name_stride + 4;
   block->selector_names =
      MALLOC(block->num_groups * block->b->selectors * block->selector_name_stride);
   if (!block->selector_names)
      return false;

   groupname = block->group_names;
   p = block->selector_names;
   for (i = 0; i < block->num_groups; ++i) {
      for (j = 0; j < block->b->selectors; ++j) {
         sprintf(p, "%s_%03d", groupname, j);
         p += block->selector_name_stride;
      }
      groupname += block->group_name_stride;
   }

   return true;
}

int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
                            struct pipe_driver_query_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_pc_block *block;
   unsigned base_gid, sub;

   if (!pc)
      return 0;

   if (!info) {
      unsigned bid, num_queries = 0;

      for (bid = 0; bid < pc->num_blocks; ++bid) {
         num_queries += pc->blocks[bid].b->selectors * pc->blocks[bid].num_groups;
      }

      return num_queries;
   }

   block = lookup_counter(pc, index, &base_gid, &sub);
   if (!block)
      return 0;

   if (!block->selector_names) {
      if (!si_init_block_names(screen, block))
         return 0;
   }
   info->name = block->selector_names + sub * block->selector_name_stride;
   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
   info->max_value.u64 = 0;
   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
   info->group_id = base_gid + sub / block->b->selectors;
   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
   return 1;
}

int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
                                  struct pipe_driver_query_group_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_pc_block *block;

   if (!pc)
      return 0;

   if (!info)
      return pc->num_groups;

   block = lookup_group(pc, &index);
   if (!block)
      return 0;

   if (!block->group_names) {
      if (!si_init_block_names(screen, block))
         return 0;
   }
   info->name = block->group_names + index * block->group_name_stride;
   info->num_queries = block->b->selectors;
   info->max_active_queries = block->b->b->num_counters;
   return 1;
}

void si_destroy_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc = screen->perfcounters;
   unsigned i;

   if (!pc)
      return;

   for (i = 0; i < pc->num_blocks; ++i) {
      FREE(pc->blocks[i].group_names);
      FREE(pc->blocks[i].selector_names);
   }
   FREE(pc->blocks);
   FREE(pc);
   screen->perfcounters = NULL;
}

void si_init_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc;
   const struct si_pc_block_gfxdescr *blocks;
   unsigned num_blocks;
   unsigned i;

   switch (screen->info.chip_class) {
   case GFX7:
      blocks = groups_CIK;
      num_blocks = ARRAY_SIZE(groups_CIK);
      break;
   case GFX8:
      blocks = groups_VI;
      num_blocks = ARRAY_SIZE(groups_VI);
      break;
   case GFX9:
      blocks = groups_gfx9;
      num_blocks = ARRAY_SIZE(groups_gfx9);
      break;
   case GFX10:
   case GFX10_3:
      blocks = groups_gfx10;
      num_blocks = ARRAY_SIZE(groups_gfx10);
      break;
   case GFX6:
   default:
      return; /* not implemented */
   }

   screen->perfcounters = pc = CALLOC_STRUCT(si_perfcounters);
   if (!pc)
      return;

   pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   pc->num_instance_cs_dwords = 3;

   pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   pc->blocks = CALLOC(num_blocks, sizeof(struct si_pc_block));
   if (!pc->blocks)
      goto error;
   pc->num_blocks = num_blocks;

   for (i = 0; i < num_blocks; ++i) {
      struct si_pc_block *block = &pc->blocks[i];
      block->b = &blocks[i];
      block->num_instances = MAX2(1, block->b->instances);

      if (!strcmp(block->b->b->name, "CB") ||
          !strcmp(block->b->b->name, "DB") ||
          !strcmp(block->b->b->name, "RMI"))
         block->num_instances = screen->info.max_se;
      else if (!strcmp(block->b->b->name, "TCC"))
         block->num_instances = screen->info.num_tcc_blocks;
      else if (!strcmp(block->b->b->name, "IA"))
         block->num_instances = MAX2(1, screen->info.max_se / 2);
      else if (!strcmp(block->b->b->name, "TA") ||
               !strcmp(block->b->b->name, "TCP") ||
               !strcmp(block->b->b->name, "TD")) {
         block->num_instances = MAX2(1, screen->info.max_good_cu_per_sa);
      }

      if (si_pc_block_has_per_instance_groups(pc, block)) {
         block->num_groups = block->num_instances;
      } else {
         block->num_groups = 1;
      }

      if (si_pc_block_has_per_se_groups(pc, block))
         block->num_groups *= screen->info.max_se;
      if (block->b->b->flags & SI_PC_BLOCK_SHADER)
         block->num_groups *= ARRAY_SIZE(si_pc_shader_type_bits);

      pc->num_groups += block->num_groups;
   }

   return;

error:
   si_destroy_perfcounters(screen);
}