/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "ac_perfcounter.h"
#include "amdgfxregs.h"
#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

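/* Select which shader stages the SQ performance counters track: the low seven
 * bits of 'shaders' form a stage mask. GFX11 programs SQG_PERFCOUNTER_CTRL;
 * older GPUs program SQ_PERFCOUNTER_CTRL plus its companion mask register,
 * which is left fully enabled here. */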
void
radv_perfcounter_emit_shaders(struct radv_device *device, struct radeon_cmdbuf *cs, unsigned shaders)
{
   if (device->physical_device->rad_info.gfx_level >= GFX11) {
      radeon_set_uconfig_reg(cs, R_036760_SQG_PERFCOUNTER_CTRL, shaders & 0x7f);
   } else {
      radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
      radeon_emit(cs, shaders & 0x7f);
      radeon_emit(cs, 0xffffffff);
   }
}

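/* Enable/disable the "windowed" counters that only count inside a begin/end
 * window: a PERFCOUNTER_START/STOP event on the gfx queue, plus
 * COMPUTE_PERFCOUNT_ENABLE for compute work. */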
static void
radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family, bool enable)
{
   if (family == RADV_QUEUE_GENERAL) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable));
}

void
radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
}

void
radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   /* Start SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(device, cs, family, true);
}

void
radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   radv_emit_windowed_counters(device, cs, family, false);

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(device->physical_device->rad_info.never_stop_sq_perf_counters
                                                           ? V_036020_STRM_PERFMON_STATE_START_COUNTING
                                                           : V_036020_STRM_PERFMON_STATE_STOP_COUNTING));
}

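/* How a counter combines its registers into one result; see
 * radv_pc_get_result(). SUM and MAX reduce reg0 across instances,
 * RATIO_DIVSCALE computes reg0 / (reg1 * reg2) * 100,
 * REVERSE_RATIO computes (reg1 - reg0) / reg1 * 100 and
 * SUM_WEIGHTED_4 computes reg0*reg1 + reg2*reg3 + reg4*reg5 + reg6*reg7. */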
enum radv_perfcounter_op {
   RADV_PC_OP_SUM,
   RADV_PC_OP_MAX,
   RADV_PC_OP_RATIO_DIVSCALE,
   RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */
   RADV_PC_OP_SUM_WEIGHTED_4,
};

#define S_REG_SEL(x)   ((x)&0xFFFF)
#define G_REG_SEL(x)   ((x)&0xFFFF)
#define S_REG_BLOCK(x) ((x) << 16)
#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF)

#define S_REG_OFFSET(x)    ((x)&0xFFFF)
#define G_REG_OFFSET(x)    ((x)&0xFFFF)
#define S_REG_INSTANCES(x) ((x) << 16)
#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF)
#define S_REG_CONSTANT(x)  ((x) << 31)
#define G_REG_CONSTANT(x)  ((x) >> 31)

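/* regs[] entries are encoded twice over their lifetime: descriptors store a
 * block id plus event selector (S_REG_BLOCK/S_REG_SEL) or a literal value
 * (S_REG_CONSTANT); when a query pool is created, each non-constant entry is
 * rewritten to a query-buffer offset plus instance count
 * (S_REG_OFFSET/S_REG_INSTANCES) for result readback. */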
struct radv_perfcounter_impl {
   enum radv_perfcounter_op op;
   uint32_t regs[8];
};

/* Only append to this list; never insert into the middle or remove entries
 * (renaming is fine).
 *
 * The invariant is that a UUID identifies a counter's meaning: the same UUID
 * can be shared by counters that have different implementations on different
 * GPUs, but it must be unique within a GPU.
 */
enum radv_perfcounter_uuid {
   RADV_PC_UUID_GPU_CYCLES,
   RADV_PC_UUID_SHADER_WAVES,
   RADV_PC_UUID_SHADER_INSTRUCTIONS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS,
   RADV_PC_UUID_SHADER_VALU_BUSY,
   RADV_PC_UUID_SHADER_SALU_BUSY,
   RADV_PC_UUID_VRAM_READ_SIZE,
   RADV_PC_UUID_VRAM_WRITE_SIZE,
   RADV_PC_UUID_L0_CACHE_HIT_RATIO,
   RADV_PC_UUID_L1_CACHE_HIT_RATIO,
   RADV_PC_UUID_L2_CACHE_HIT_RATIO,
};

struct radv_perfcounter_desc {
   struct radv_perfcounter_impl impl;

   VkPerformanceCounterUnitKHR unit;

   char name[VK_MAX_DESCRIPTION_SIZE];
   char category[VK_MAX_DESCRIPTION_SIZE];
   char description[VK_MAX_DESCRIPTION_SIZE];
   enum radv_perfcounter_uuid uuid;
};

#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...)                              \
   (struct radv_perfcounter_desc)                                                                                      \
   {                                                                                                                   \
      .impl = {.op = arg_op, .regs = {__VA_ARGS__}}, .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR,             \
      .name = arg_name, .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid      \
   }

#define ADD_PC(op, unit, name, category, description, uuid, ...)                                                       \
   do {                                                                                                                \
      if (descs) {                                                                                                     \
         descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__);                          \
      }                                                                                                                \
      ++*count;                                                                                                        \
   } while (0)
#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr))
#define CONSTANT(v)     (S_REG_CONSTANT(1) | (uint32_t)(v))
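/* Example: CTR(GL1C, 0xe) packs the GL1C block id into bits 16..30 and the
 * event selector 0xe into bits 0..15, recoverable with G_REG_BLOCK() and
 * G_REG_SEL(); CONSTANT(32) sets bit 31 so that result evaluation returns the
 * literal value 32 instead of reading counter data. */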

enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) };

enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) };

enum {
   GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe),
   GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12),
};

enum {
   GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3),

   GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23),
   GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c),

   GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b),
   GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66),
};

enum {
   SQ_PERF_SEL_WAVES = CTR(SQ, 0x4),
   SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31),
   SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37),
   SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b),
   SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c),
   SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d),
   SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40),
   SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45),
   SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46),
   SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75),
};

enum {
   TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9),
   TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12),
};

#define CTR_NUM_SIMD CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_cu)
#define CTR_NUM_CUS  CONSTANT(pdev->rad_info.num_cu)

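/* Build the list of exposed counters. Called twice: first with descs == NULL
 * to obtain the count, then again to fill in a buffer of that size. */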
static void
radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count, struct radv_perfcounter_desc *descs)
{
   *count = 0;

   ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM", "cycles the GPU is active processing a command buffer.",
          GPU_CYCLES, GRBM_PERF_SEL_GUI_ACTIVE);

   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES, SQ_PERF_SEL_WAVES);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed", SHADER_INSTRUCTIONS,
          SQ_PERF_SEL_INSTS_ALL_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders", "Number of VALU Instructions executed",
          SHADER_INSTRUCTIONS_VALU, SQ_PERF_SEL_INSTS_VALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders", "Number of SALU Instructions executed",
          SHADER_INSTRUCTIONS_SALU, SQ_PERF_SEL_INSTS_SALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders", "Number of VMEM load instructions executed",
          SHADER_INSTRUCTIONS_VMEM_LOAD, SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders", "Number of SMEM load instructions executed",
          SHADER_INSTRUCTIONS_SMEM_LOAD, SQ_PERF_SEL_INSTS_SMEM_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders", "Number of VMEM store instructions executed",
          SHADER_INSTRUCTIONS_VMEM_STORE, SQ_PERF_SEL_INSTS_TEX_STORE_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders", "Number of LDS Instructions executed",
          SHADER_INSTRUCTIONS_LDS, SQ_PERF_SEL_INSTS_LDS_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders", "Number of GDS Instructions executed",
          SHADER_INSTRUCTIONS_GDS, SQ_PERF_SEL_INSTS_GDS_GFX10);

   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization",
          "Percentage of time the VALU units are busy", SHADER_VALU_BUSY, SQ_PERF_SEL_INST_CYCLES_VALU_GFX10,
          CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD);
   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization",
          "Percentage of time the SALU units are busy", SHADER_SALU_BUSY, SQ_PERF_SEL_INSTS_SALU_GFX10,
          CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS);

   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", "Number of bytes read from VRAM",
             VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103, CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103,
             CONSTANT(64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", "Number of bytes written to VRAM",
             VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103, CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103,
             CONSTANT(64), CONSTANT(0), CONSTANT(0), CONSTANT(0), CONSTANT(0));
   } else {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", "Number of bytes read from VRAM",
             VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101, CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101,
             CONSTANT(64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", "Number of bytes written to VRAM",
             VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101, CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101,
             CONSTANT(32), CONSTANT(0), CONSTANT(0), CONSTANT(0), CONSTANT(0));
   }

   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache", L0_CACHE_HIT_RATIO,
          TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10);
   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache", L1_CACHE_HIT_RATIO,
          GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ);
   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", "Hit ratio of L2 cache",
             L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103, GL2C_PERF_SEL_REQ);
   } else {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", "Hit ratio of L2 cache",
             L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101, GL2C_PERF_SEL_REQ);
   }
}

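/* Lazily build the descriptor table and cache it on the physical device. */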
static bool
radv_init_perfcounter_descs(struct radv_physical_device *pdev)
{
   if (pdev->perfcounters)
      return true;

   uint32_t count;
   radv_query_perfcounter_descs(pdev, &count, NULL);

   struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count);
   if (!descs)
      return false;

   radv_query_perfcounter_descs(pdev, &count, descs);
   pdev->num_perfcounters = count;
   pdev->perfcounters = descs;

   return true;
}

static int
cmp_uint32_t(const void *a, const void *b)
{
   uint32_t l = *(const uint32_t *)a;
   uint32_t r = *(const uint32_t *)b;

   return (l < r) ? -1 : (l > r) ? 1 : 0;
}

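/* Collect the hardware registers needed for the given counter indices: gather
 * every non-constant reg, sort (the block id lives in the high bits, so this
 * also groups registers by block) and de-duplicate. The caller takes ownership
 * of *out_regs. */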
static VkResult
radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices, const uint32_t *indices,
                           unsigned *out_num_regs, uint32_t **out_regs)
{
   ASSERTED uint32_t num_counters = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs);
   uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t));
   if (!regs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned reg_cnt = 0;
   for (unsigned i = 0; i < num_indices; ++i) {
      uint32_t index = indices[i];
      assert(index < num_counters);
      for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j]; ++j) {
         if (!G_REG_CONSTANT(descs[index].impl.regs[j]))
            regs[reg_cnt++] = descs[index].impl.regs[j];
      }
   }

   qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t);

   unsigned deduped_reg_cnt = 0;
   for (unsigned i = 1; i < reg_cnt; ++i) {
      if (regs[i] != regs[deduped_reg_cnt])
         regs[++deduped_reg_cnt] = regs[i];
   }
   ++deduped_reg_cnt;

   *out_num_regs = deduped_reg_cnt;
   *out_regs = regs;
   return VK_SUCCESS;
}

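/* Total number of instances sampled for a block; blocks with the
 * AC_PC_BLOCK_SE flag are replicated once per shader engine. */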
static unsigned
radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block)
{
   return ac_block->num_instances * ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdevice->rad_info.max_se : 1);
}

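/* A block only has a limited number of hardware counters, so a query may need
 * several submissions ("passes") to cover all requested registers. Assumes
 * regs is sorted by block; the result is the worst case over all blocks of
 * ceil(regs in block / counters per block). */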
static unsigned
radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs, const uint32_t *regs)
{
   enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK;
   unsigned block_reg_count = 0;
   struct ac_pc_block *ac_block = NULL;
   unsigned passes_needed = 1;

   for (unsigned i = 0; i < num_regs; ++i) {
      enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]);

      if (block != prev_block) {
         block_reg_count = 0;
         prev_block = block;
         ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      }

      ++block_reg_count;

      passes_needed = MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters));
   }

   return passes_needed;
}

void
radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool)
{
   free(pool->counters);
   free(pool->pc_regs);
}

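/* Query buffer layout: for each hardware register, one (begin, end) pair of
 * 64-bit samples per instance, followed by an 8-byte completion slot per pass.
 * pool->counters holds a copy of each counter's impl with its regs rewritten
 * from block+selector form into offset+instance-count form. */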
VkResult
radv_pc_init_query_pool(struct radv_physical_device *pdevice, const VkQueryPoolCreateInfo *pCreateInfo,
                        struct radv_pc_query_pool *pool)
{
   const VkQueryPoolPerformanceCreateInfoKHR *perf_info =
      vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
   VkResult result;

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   result = radv_get_counter_registers(pdevice, perf_info->counterIndexCount, perf_info->pCounterIndices,
                                       &pool->num_pc_regs, &pool->pc_regs);
   if (result != VK_SUCCESS)
      return result;

   pool->num_passes = radv_get_num_counter_passes(pdevice, pool->num_pc_regs, pool->pc_regs);

   uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t));
   if (!pc_reg_offsets)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned offset = 0;
   for (unsigned i = 0; i < pool->num_pc_regs; ++i) {
      enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16;
      struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

      pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances);
      offset += sizeof(uint64_t) * 2 * num_instances;
   }

   /* Allow a uint32_t per pass to signal completion. */
   pool->b.stride = offset + 8 * pool->num_passes;

   pool->num_counters = perf_info->counterIndexCount;
   pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl));
   if (!pool->counters) {
      free(pc_reg_offsets);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   for (unsigned i = 0; i < pool->num_counters; ++i) {
      pool->counters[i] = pdevice->perfcounters[perf_info->pCounterIndices[i]].impl;

      for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) {
         uint32_t reg = pool->counters[i].regs[j];
         if (!reg || G_REG_CONSTANT(reg))
            continue;

         unsigned k;
         for (k = 0; k < pool->num_pc_regs; ++k)
            if (pool->pc_regs[k] == reg)
               break;
         pool->counters[i].regs[j] = pc_reg_offsets[k];
      }
   }

   free(pc_reg_offsets);
   return VK_SUCCESS;
}

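/* Point GRBM register writes/reads at one shader engine and block instance,
 * or broadcast to all of them when the corresponding index is negative. */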
static void
radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}

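/* Program a block's select registers so each hardware counter counts the
 * requested event; the secondary (SPM) select registers are cleared. */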
static void
radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, unsigned *selectors)
{
   const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
   const enum radv_queue_family qf = cmd_buffer->qf;
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   for (idx = 0; idx < count; ++idx) {
      radeon_set_perfctr_reg(gfx_level, qf, cs, regs->select0[idx], G_REG_SEL(selectors[idx]) | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1);
      radeon_emit(cs, 0);
   }
}

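/* Read back 'count' 64-bit counters of the currently selected block instance
 * via COPY_DATA from the perf register space into the query buffer. The
 * destination stride leaves room for every instance's (begin, end) pair of
 * each register. */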
static void
radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                                 uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   assert(regs->select0);
   for (unsigned idx = 0; idx < count; ++idx) {
      if (regs->counters)
         reg = regs->counters[idx];

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM |
                         COPY_DATA_COUNT_SEL); /* 64 bits */
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      va += sizeof(uint64_t) * 2 * radv_pc_get_num_instances(cmd_buffer->device->physical_device, block);
      reg += reg_delta;
   }
}

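/* Sample a whole block: iterate over all shader engines and block instances,
 * select each one through GRBM_GFX_INDEX and read its counters. 'va' points
 * at the begin or end slot of the block's first register. */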
static void
radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, uint64_t va)
{
   unsigned se_end = 1;
   if (block->b->b->flags & AC_PC_BLOCK_SE)
      se_end = cmd_buffer->device->physical_device->rad_info.max_se;

   for (unsigned se = 0; se < se_end; ++se) {
      for (unsigned instance = 0; instance < block->num_instances; ++instance) {
         radv_emit_instance(cmd_buffer, se, instance);
         radv_pc_emit_block_instance_read(cmd_buffer, block, count, va);
         va += sizeof(uint64_t) * 2;
      }
   }
}

static void
radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

   radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   radeon_emit(cs, 0);          /* CP_COHER_CNTL */
   radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
   radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
   radeon_emit(cs, 0);          /* CP_COHER_BASE */
   radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
   radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
   radeon_emit(cs, 0);          /* GCR_CNTL */

   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(cs, 0);
}

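/* Stop the counters and sample every configured register, writing the begin
 * half of each (begin, end) pair when end == false and the end half otherwise.
 * Each pass is wrapped in a COND_EXEC on its predicate in perf_counter_bo, so
 * only the pass selected at submit time actually executes; on end, the pass
 * completion slot is also signaled. */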
static void
radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va, bool end)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   radv_pc_wait_idle(cmd_buffer);

   radv_emit_instance(cmd_buffer, -1, -1);
   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, false);

   radeon_set_uconfig_reg(
      cs, R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET + 8 * pass;
      uint64_t reg_va = va + (end ? 8 : 0);

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;
         unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt,
                                 reg_va + offset * num_instances * sizeof(uint64_t));
         }

         i += cnt;
         reg_va += num_instances * sizeof(uint64_t) * 2 * cnt;
      }

      if (end) {
         uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass;
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
         radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
         radeon_emit(cs, signal_va);
         radeon_emit(cs, signal_va >> 32);
         radeon_emit(cs, 1); /* value */
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);
}

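/* vkCmdBeginQuery for performance queries: reset the perfmon state, program
 * the counter selects for each pass under its predicate, take the begin
 * sample, then start counting. */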
void
radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
   ASSERTED unsigned cdw_max;

   cmd_buffer->state.uses_perf_counters = true;

   cdw_max = radeon_check_space(cmd_buffer->device->ws, cs,
                                256 +                      /* Random one time stuff */
                                   10 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * (5 + 8));

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
   radeon_emit(cs, perf_ctr_va);
   radeon_emit(cs, perf_ctr_va >> 32);
   radeon_emit(cs, 0); /* value */

   radv_pc_wait_idle(cmd_buffer);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));

   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, true);
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, true);
   radv_perfcounter_emit_shaders(cmd_buffer->device, cs, 0x7f);

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET + 8 * pass;

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset);
         }

         i += cnt;
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);

   /* The following sequence actually starts the perfcounters. */

   radv_pc_stop_and_sample(cmd_buffer, pool, va, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, true);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

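/* vkCmdEndQuery for performance queries: wait for all prior work via a
 * bottom-of-pipe timestamp, take the end sample, then disable the counters
 * and restore clock gating and SPI config. */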
void
radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   ASSERTED unsigned cdw_max;

   cdw_max = radeon_check_space(cmd_buffer->device->ws, cs,
                                256 + /* Reserved for things that don't scale with passes/counters */
                                   5 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * 8);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
                                V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va,
                                1, cmd_buffer->gfx9_fence_va);
   radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff);

   radv_pc_wait_idle(cmd_buffer);
   radv_pc_stop_and_sample(cmd_buffer, pool, va, true);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, false);
   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, false);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

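/* Result evaluation helpers. For a non-constant reg, data holds per-instance
 * (begin, end) sample pairs at the reg's decoded offset; constants just
 * return their embedded 31-bit value. */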
static uint64_t
radv_pc_sum_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result += data[offset + 2 * i + 1] - data[offset + 2 * i];
   }

   return result;
}

static uint64_t
radv_pc_max_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result = MAX2(result, data[offset + 2 * i + 1]);
   }

   return result;
}

static union VkPerformanceCounterResultKHR
radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data)
{
   union VkPerformanceCounterResultKHR result;

   switch (impl->op) {
   case RADV_PC_OP_MAX:
      result.float64 = radv_pc_max_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_SUM:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_RATIO_DIVSCALE:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data) / (double)radv_pc_sum_reg(impl->regs[1], data) /
                       radv_pc_sum_reg(impl->regs[2], data) * 100.0;
      break;
   case RADV_PC_OP_REVERSE_RATIO: {
      double tmp = radv_pc_sum_reg(impl->regs[1], data);
      result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0;
      break;
   }
   case RADV_PC_OP_SUM_WEIGHTED_4:
      result.float64 = 0.0;
      for (unsigned i = 0; i < 4; ++i)
         result.float64 += radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data);
      break;
   default:
      unreachable("unhandled performance counter operation");
   }
   return result;
}

void
radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out)
{
   union VkPerformanceCounterResultKHR *pc_result = out;

   for (unsigned i = 0; i < pc_pool->num_counters; ++i) {
      pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (vk_queue_to_radv(pdevice, queueFamilyIndex) != RADV_QUEUE_GENERAL) {
      *pCounterCount = 0;
      return VK_SUCCESS;
   }

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t counter_cnt = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   if (!pCounters && !pCounterDescriptions) {
      *pCounterCount = counter_cnt;
      return VK_SUCCESS;
   }

   VkResult result = counter_cnt > *pCounterCount ? VK_INCOMPLETE : VK_SUCCESS;
   counter_cnt = MIN2(counter_cnt, *pCounterCount);
   *pCounterCount = counter_cnt;

   for (uint32_t i = 0; i < counter_cnt; ++i) {
      if (pCounters) {
         pCounters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
         pCounters[i].unit = descs[i].unit;
         pCounters[i].scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
         pCounters[i].storage = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR;

         memset(&pCounters[i].uuid, 0, sizeof(pCounters[i].uuid));
         strcpy((char *)&pCounters[i].uuid, "RADV");

         const uint32_t uuid = descs[i].uuid;
         memcpy(&pCounters[i].uuid[12], &uuid, sizeof(uuid));
      }

      if (pCounterDescriptions) {
         pCounterDescriptions[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
         pCounterDescriptions[i].flags = VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR;
         strcpy(pCounterDescriptions[i].name, descs[i].name);
         strcpy(pCounterDescriptions[i].category, descs[i].category);
         strcpy(pCounterDescriptions[i].description, descs[i].description);
      }
   }
   return result;
}

VKAPI_ATTR void VKAPI_CALL
radv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice, const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
   uint32_t *pNumPasses)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (pPerformanceQueryCreateInfo->counterIndexCount == 0) {
      *pNumPasses = 0;
      return;
   }

   if (!radv_init_perfcounter_descs(pdevice)) {
      /* Can't return an error, so log it. */
      fprintf(stderr, "radv: Failed to init perf counters\n");
      *pNumPasses = 1;
      return;
   }

   assert(vk_queue_to_radv(pdevice, pPerformanceQueryCreateInfo->queueFamilyIndex) == RADV_QUEUE_GENERAL);

   unsigned num_regs = 0;
   uint32_t *regs = NULL;
   VkResult result = radv_get_counter_registers(pdevice, pPerformanceQueryCreateInfo->counterIndexCount,
                                                pPerformanceQueryCreateInfo->pCounterIndices, &num_regs, &regs);
   if (result != VK_SUCCESS) {
      /* Can't return an error, so log it. */
      fprintf(stderr, "radv: Failed to allocate memory for perf counters\n");
   }

   *pNumPasses = radv_get_num_counter_passes(pdevice, num_regs, regs);
   free(regs);
}