/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "ac_perfcounter.h"
#include "amdgfxregs.h"
#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

void
radv_perfcounter_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(cs, shaders & 0x7f);
   radeon_emit(cs, 0xffffffff);
}

static void
radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family,
                            bool enable)
{
   if (family == RADV_QUEUE_GENERAL) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) |
                         EVENT_INDEX(0));
   }

   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable));
}

void
radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
}

void
radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   /* Start SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(device, cs, family, true);
}

void
radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   radv_emit_windowed_counters(device, cs, family, false);

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(device->physical_device->rad_info.never_stop_sq_perf_counters ?
                                                            V_036020_STRM_PERFMON_STATE_START_COUNTING :
                                                            V_036020_STRM_PERFMON_STATE_STOP_COUNTING));
}

enum radv_perfcounter_op {
   RADV_PC_OP_SUM,
   RADV_PC_OP_MAX,
   RADV_PC_OP_RATIO_DIVSCALE,
   RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */
   RADV_PC_OP_SUM_WEIGHTED_4,
};

#define S_REG_SEL(x)   ((x)&0xFFFF)
#define G_REG_SEL(x)   ((x)&0xFFFF)
#define S_REG_BLOCK(x) ((x) << 16)
#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF)

#define S_REG_OFFSET(x)    ((x)&0xFFFF)
#define G_REG_OFFSET(x)    ((x)&0xFFFF)
#define S_REG_INSTANCES(x) ((x) << 16)
#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF)
#define S_REG_CONSTANT(x)  ((x) << 31)
#define G_REG_CONSTANT(x)  ((x) >> 31)

struct radv_perfcounter_impl {
   enum radv_perfcounter_op op;
   uint32_t regs[8];
};
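
/* Each regs[] entry is a packed 32-bit descriptor. Until the query pool is
 * created it holds a counter selector: bits 0-15 are the per-block selector
 * (S_REG_SEL), bits 16-30 the GPU block (S_REG_BLOCK) and bit 31 marks an
 * inline constant instead of a hardware counter (S_REG_CONSTANT). In
 * radv_pc_init_query_pool the non-constant entries are rewritten to describe
 * where the sampled values live in the results buffer: bits 0-15 then hold a
 * byte offset (S_REG_OFFSET) and bits 16-30 the number of sampled instances
 * (S_REG_INSTANCES).
 */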

/* Only append to this list, never insert into the middle or remove entries
 * (renaming is fine).
 *
 * The invariant we want is that a UUID always identifies a counter with the
 * same meaning: it can be shared between counters that have different
 * implementations on different GPUs, but it must be unique within a GPU.
 */
enum radv_perfcounter_uuid {
   RADV_PC_UUID_GPU_CYCLES,
   RADV_PC_UUID_SHADER_WAVES,
   RADV_PC_UUID_SHADER_INSTRUCTIONS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS,
   RADV_PC_UUID_SHADER_VALU_BUSY,
   RADV_PC_UUID_SHADER_SALU_BUSY,
   RADV_PC_UUID_VRAM_READ_SIZE,
   RADV_PC_UUID_VRAM_WRITE_SIZE,
   RADV_PC_UUID_L0_CACHE_HIT_RATIO,
   RADV_PC_UUID_L1_CACHE_HIT_RATIO,
   RADV_PC_UUID_L2_CACHE_HIT_RATIO,
};

struct radv_perfcounter_desc {
   struct radv_perfcounter_impl impl;

   VkPerformanceCounterUnitKHR unit;

   char name[VK_MAX_DESCRIPTION_SIZE];
   char category[VK_MAX_DESCRIPTION_SIZE];
   char description[VK_MAX_DESCRIPTION_SIZE];
   enum radv_perfcounter_uuid uuid;
};

#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...)          \
   (struct radv_perfcounter_desc)                                                                  \
   {                                                                                               \
      .impl = {.op = arg_op, .regs = {__VA_ARGS__}},                                               \
      .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR, .name = arg_name,                      \
      .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid    \
   }

#define ADD_PC(op, unit, name, category, description, uuid, ...)                                   \
   do {                                                                                            \
      if (descs) {                                                                                 \
         descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__);      \
      }                                                                                            \
      ++*count;                                                                                    \
   } while (0)
#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr))
#define CONSTANT(v)     (S_REG_CONSTANT(1) | (uint32_t)(v))

enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) };

enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) };

enum {
   GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe),
   GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12),
};

enum {
   GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3),

   GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23),
   GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c),

   GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b),
   GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66),
};

enum {
   SQ_PERF_SEL_WAVES = CTR(SQ, 0x4),
   SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31),
   SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37),
   SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b),
   SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c),
   SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d),
   SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40),
   SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45),
   SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46),
   SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75),
};

enum {
   TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9),
   TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12),
};

#define CTR_NUM_SIMD                                                                               \
   CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_cu)
#define CTR_NUM_CUS CONSTANT(pdev->rad_info.num_cu)
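
/* Builds the table of counters exposed to the application. This is called
 * twice by radv_init_perfcounter_descs: first with descs == NULL, in which
 * case ADD_PC only increments *count, and then again with a buffer of that
 * size to fill in the descriptions. GFX10.1 and GFX10.3 use different GL2C
 * selector values, so the memory counters are chosen based on gfx_level.
 */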
static void
radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count,
                             struct radv_perfcounter_desc *descs)
{
   *count = 0;

   ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM",
          "cycles the GPU is active processing a command buffer.", GPU_CYCLES,
          GRBM_PERF_SEL_GUI_ACTIVE);

   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES,
          SQ_PERF_SEL_WAVES);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed",
          SHADER_INSTRUCTIONS, SQ_PERF_SEL_INSTS_ALL_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders",
          "Number of VALU Instructions executed", SHADER_INSTRUCTIONS_VALU,
          SQ_PERF_SEL_INSTS_VALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders",
          "Number of SALU Instructions executed", SHADER_INSTRUCTIONS_SALU,
          SQ_PERF_SEL_INSTS_SALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders",
          "Number of VMEM load instructions executed", SHADER_INSTRUCTIONS_VMEM_LOAD,
          SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders",
          "Number of SMEM load instructions executed", SHADER_INSTRUCTIONS_SMEM_LOAD,
          SQ_PERF_SEL_INSTS_SMEM_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders",
          "Number of VMEM store instructions executed", SHADER_INSTRUCTIONS_VMEM_STORE,
          SQ_PERF_SEL_INSTS_TEX_STORE_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders",
          "Number of LDS Instructions executed", SHADER_INSTRUCTIONS_LDS,
          SQ_PERF_SEL_INSTS_LDS_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders",
          "Number of GDS Instructions executed", SHADER_INSTRUCTIONS_GDS,
          SQ_PERF_SEL_INSTS_GDS_GFX10);

   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization",
          "Percentage of time the VALU units are busy", SHADER_VALU_BUSY,
          SQ_PERF_SEL_INST_CYCLES_VALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD);
   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization",
          "Percentage of time the SALU units are busy", SHADER_SALU_BUSY,
          SQ_PERF_SEL_INSTS_SALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS);

   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103,
             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103, CONSTANT(64),
             GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103,
             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103, CONSTANT(64), CONSTANT(0),
             CONSTANT(0), CONSTANT(0), CONSTANT(0));
   } else {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101,
             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101, CONSTANT(64),
             GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101,
             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101, CONSTANT(32), CONSTANT(0),
             CONSTANT(0), CONSTANT(0), CONSTANT(0));
   }

   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache",
          L0_CACHE_HIT_RATIO, TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10);
   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache",
          L1_CACHE_HIT_RATIO, GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ);
   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103,
             GL2C_PERF_SEL_REQ);
   } else {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101,
             GL2C_PERF_SEL_REQ);
   }
}

static bool
radv_init_perfcounter_descs(struct radv_physical_device *pdev)
{
   if (pdev->perfcounters)
      return true;

   uint32_t count;
   radv_query_perfcounter_descs(pdev, &count, NULL);

   struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count);
   if (!descs)
      return false;

   radv_query_perfcounter_descs(pdev, &count, descs);
   pdev->num_perfcounters = count;
   pdev->perfcounters = descs;

   return true;
}

static int
cmp_uint32_t(const void *a, const void *b)
{
   uint32_t l = *(const uint32_t *)a;
   uint32_t r = *(const uint32_t *)b;

   return (l < r) ? -1 : (l > r) ? 1 : 0;
}
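
/* Gathers the hardware counter selectors needed for a set of enabled counter
 * indices. Inline constants are skipped, and the resulting list is sorted
 * (the block id lives in the high bits, so sorting groups selectors by block)
 * and deduplicated so each selector is only programmed once.
 */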
static VkResult
radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices,
                           const uint32_t *indices, unsigned *out_num_regs, uint32_t **out_regs)
{
   ASSERTED uint32_t num_counters = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs);
   uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t));
   if (!regs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned reg_cnt = 0;
   for (unsigned i = 0; i < num_indices; ++i) {
      uint32_t index = indices[i];
      assert(index < num_counters);
      for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j];
           ++j) {
         if (!G_REG_CONSTANT(descs[index].impl.regs[j]))
            regs[reg_cnt++] = descs[index].impl.regs[j];
      }
   }

   qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t);

   unsigned deduped_reg_cnt = 0;
   for (unsigned i = 1; i < reg_cnt; ++i) {
      if (regs[i] != regs[deduped_reg_cnt])
         regs[++deduped_reg_cnt] = regs[i];
   }
   ++deduped_reg_cnt;

   *out_num_regs = deduped_reg_cnt;
   *out_regs = regs;
   return VK_SUCCESS;
}

static unsigned
radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block)
{
   return ac_block->num_instances *
          ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdevice->rad_info.max_se : 1);
}
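
/* Each GPU block only has a limited number of physical counters
 * (ac_block->b->b->num_counters). When a query needs more selectors from one
 * block than fit at once, the query has to be replayed over multiple passes,
 * which is what the VK_KHR_performance_query pass mechanism maps to. The pass
 * count is the maximum over all blocks of
 * DIV_ROUND_UP(selectors_used_in_block, counters_in_block); the regs array is
 * expected to be sorted so selectors of the same block are adjacent.
 */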
static unsigned
radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs,
                            const uint32_t *regs)
{
   enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK;
   unsigned block_reg_count = 0;
   struct ac_pc_block *ac_block = NULL;
   unsigned passes_needed = 1;

   for (unsigned i = 0; i < num_regs; ++i) {
      enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]);

      if (block != prev_block) {
         block_reg_count = 0;
         prev_block = block;
         ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      }

      ++block_reg_count;

      passes_needed =
         MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters));
   }

   return passes_needed;
}

void
radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool)
{
   free(pool->counters);
   free(pool->pc_regs);
}
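
/* Per-query results buffer layout: for every programmed selector there are
 * two 64-bit values (a begin and an end sample) per hardware instance,
 * followed by 8 bytes per pass that the end-of-query code uses as a
 * "this pass has written its data" marker. The per-counter regs[] entries are
 * rewritten below to point at those offsets, so result readback no longer
 * needs the physical device.
 */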
VkResult
radv_pc_init_query_pool(struct radv_physical_device *pdevice,
                        const VkQueryPoolCreateInfo *pCreateInfo, struct radv_pc_query_pool *pool)
{
   const VkQueryPoolPerformanceCreateInfoKHR *perf_info =
      vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
   VkResult result;

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   result =
      radv_get_counter_registers(pdevice, perf_info->counterIndexCount, perf_info->pCounterIndices,
                                 &pool->num_pc_regs, &pool->pc_regs);
   if (result != VK_SUCCESS)
      return result;

   pool->num_passes = radv_get_num_counter_passes(pdevice, pool->num_pc_regs, pool->pc_regs);

   uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t));
   if (!pc_reg_offsets)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned offset = 0;
   for (unsigned i = 0; i < pool->num_pc_regs; ++i) {
      enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16;
      struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

      pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances);
      offset += sizeof(uint64_t) * 2 * num_instances;
   }

   /* Allow a uint32_t per pass to signal completion. */
   pool->b.stride = offset + 8 * pool->num_passes;

   pool->num_counters = perf_info->counterIndexCount;
   pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl));
   if (!pool->counters) {
      free(pc_reg_offsets);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   for (unsigned i = 0; i < pool->num_counters; ++i) {
      pool->counters[i] = pdevice->perfcounters[perf_info->pCounterIndices[i]].impl;

      for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) {
         uint32_t reg = pool->counters[i].regs[j];
         if (!reg || G_REG_CONSTANT(reg))
            continue;

         unsigned k;
         for (k = 0; k < pool->num_pc_regs; ++k)
            if (pool->pc_regs[k] == reg)
               break;
         pool->counters[i].regs[j] = pc_reg_offsets[k];
      }
   }

   free(pc_reg_offsets);
   return VK_SUCCESS;
}
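
/* Selects which shader engine / block instance subsequent perf register
 * accesses target by programming GRBM_GFX_INDEX. Passing -1 for se or
 * instance selects broadcast to all of them.
 */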
static void
radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}

static void
radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                 unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   for (idx = 0; idx < count; ++idx) {
      radeon_set_perfctr_reg(cmd_buffer, regs->select0[idx],
                             G_REG_SEL(selectors[idx]) | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1);
      radeon_emit(cs, 0);
   }
}
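
/* Reads back the programmed counters of the currently selected block instance
 * with COPY_DATA from the perf register space, 64 bits at a time, and stores
 * each value in the results buffer. Successive counters of the same block are
 * spaced one (begin, end) pair per instance apart, which is why va advances
 * by 2 * sizeof(uint64_t) * num_instances per counter.
 */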
static void
radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block,
                                 unsigned count, uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   assert(regs->select0);
   for (unsigned idx = 0; idx < count; ++idx) {
      if (regs->counters)
         reg = regs->counters[idx];

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                         COPY_DATA_WR_CONFIRM | COPY_DATA_COUNT_SEL); /* 64 bits */
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      va += sizeof(uint64_t) * 2 *
            radv_pc_get_num_instances(cmd_buffer->device->physical_device, block);
      reg += reg_delta;
   }
}

static void
radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                     uint64_t va)
{
   unsigned se_end = 1;
   if (block->b->b->flags & AC_PC_BLOCK_SE)
      se_end = cmd_buffer->device->physical_device->rad_info.max_se;

   for (unsigned se = 0; se < se_end; ++se) {
      for (unsigned instance = 0; instance < block->num_instances; ++instance) {
         radv_emit_instance(cmd_buffer, se, instance);
         radv_pc_emit_block_instance_read(cmd_buffer, block, count, va);
         va += sizeof(uint64_t) * 2;
      }
   }
}

static void
radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));

   radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   radeon_emit(cs, 0);          /* CP_COHER_CNTL */
   radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
   radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
   radeon_emit(cs, 0);          /* CP_COHER_BASE */
   radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
   radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
   radeon_emit(cs, 0);          /* GCR_CNTL */

   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(cs, 0);
}
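
/* Samples all enabled counters into the results buffer at va (begin samples
 * and end samples form 64-bit pairs per instance). The per-pass work is
 * wrapped in COND_EXEC packets predicated on per-pass words in the device's
 * perf_counter_bo, so a command buffer recorded once can be replayed for
 * every pass with only the currently selected pass actually executing; the
 * submit path presumably sets the predicate for the pass being replayed.
 */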
static void
radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                        uint64_t va, bool end)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   radv_pc_wait_idle(cmd_buffer);

   radv_emit_instance(cmd_buffer, -1, -1);
   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
                             S_036020_PERFMON_SAMPLE_ENABLE(1));

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) +
                         PERF_CTR_BO_PASS_OFFSET + 8 * pass;
      uint64_t reg_va = va + (end ? 8 : 0);

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;
         unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt,
                                 reg_va + offset * num_instances * sizeof(uint64_t));
         }

         i += cnt;
         reg_va += num_instances * sizeof(uint64_t) * 2 * cnt;
      }

      if (end) {
         uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass;
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
         radeon_emit(cs,
                     S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
         radeon_emit(cs, signal_va);
         radeon_emit(cs, signal_va >> 32);
         radeon_emit(cs, 1); /* value */
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);
}
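
/* Begin-query sequence: reset CP_PERFMON, inhibit clock gating, program the
 * counter selectors for every pass under COND_EXEC predication, take the
 * "begin" sample via radv_pc_stop_and_sample(..., false) and finally let the
 * counters run.
 */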
void
radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                    uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
   ASSERTED unsigned cdw_max;

   cmd_buffer->state.uses_perf_counters = true;

   cdw_max = radeon_check_space(cmd_buffer->device->ws, cs,
                                256 +                      /* Random one time stuff */
                                   10 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * (5 + 8));

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va =
      radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
   radeon_emit(cs, perf_ctr_va);
   radeon_emit(cs, perf_ctr_va >> 32);
   radeon_emit(cs, 0); /* value */

   radv_pc_wait_idle(cmd_buffer);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));

   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, true);
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, true);
   radv_perfcounter_emit_shaders(cs, 0x7f);

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) +
                         PERF_CTR_BO_PASS_OFFSET + 8 * pass;

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset);
         }

         i += cnt;
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);

   /* The following sequence actually starts the perfcounters. */

   radv_pc_stop_and_sample(cmd_buffer, pool, va, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, true);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

void
radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   ASSERTED unsigned cdw_max;

   cdw_max =
      radeon_check_space(cmd_buffer->device->ws, cs,
                         256 + /* Reserved for things that don't scale with passes/counters */
                            5 * pool->num_passes + /* COND_EXECs */
                            pool->b.stride / 8 * 8);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va =
      radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                              radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0,
                              EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va, 1,
                              cmd_buffer->gfx9_fence_va);
   radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff);

   radv_pc_wait_idle(cmd_buffer);
   radv_pc_stop_and_sample(cmd_buffer, pool, va, true);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, false);
   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, false);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
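
/* Result accumulation: after pool init each regs[] entry either encodes an
 * inline constant (bit 31 set, low 31 bits are the value) or a buffer offset
 * plus an instance count. Every instance contributed a (begin, end) pair of
 * 64-bit samples, so e.g. a summed counter with two instances evaluates to
 * (data[o + 1] - data[o]) + (data[o + 3] - data[o + 2]), with o being the
 * offset in 64-bit units.
 */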
static uint64_t
radv_pc_sum_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result += data[offset + 2 * i + 1] - data[offset + 2 * i];
   }

   return result;
}

static uint64_t
radv_pc_max_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result = MAX2(result, data[offset + 2 * i + 1]);
   }

   return result;
}

static union VkPerformanceCounterResultKHR
radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data)
{
   union VkPerformanceCounterResultKHR result;

   switch (impl->op) {
   case RADV_PC_OP_MAX:
      result.float64 = radv_pc_max_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_SUM:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_RATIO_DIVSCALE:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data) /
                       (double)radv_pc_sum_reg(impl->regs[1], data) /
                       radv_pc_sum_reg(impl->regs[2], data) * 100.0;
      break;
   case RADV_PC_OP_REVERSE_RATIO: {
      double tmp = radv_pc_sum_reg(impl->regs[1], data);
      result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0;
      break;
   }
   case RADV_PC_OP_SUM_WEIGHTED_4:
      result.float64 = 0.0;
      for (unsigned i = 0; i < 4; ++i)
         result.float64 +=
            radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data);
      break;
   default:
      unreachable("unhandled performance counter operation");
   }
   return result;
}

void
radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out)
{
   union VkPerformanceCounterResultKHR *pc_result = out;

   for (unsigned i = 0; i < pc_pool->num_counters; ++i) {
      pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data);
   }
}

VkResult
radv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (vk_queue_to_radv(pdevice, queueFamilyIndex) != RADV_QUEUE_GENERAL) {
      *pCounterCount = 0;
      return VK_SUCCESS;
   }

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t counter_cnt = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   if (!pCounters && !pCounterDescriptions) {
      *pCounterCount = counter_cnt;
      return VK_SUCCESS;
   }

   VkResult result = counter_cnt > *pCounterCount ? VK_INCOMPLETE : VK_SUCCESS;
   counter_cnt = MIN2(counter_cnt, *pCounterCount);
   *pCounterCount = counter_cnt;

   for (uint32_t i = 0; i < counter_cnt; ++i) {
      if (pCounters) {
         pCounters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
         pCounters[i].unit = descs[i].unit;
         pCounters[i].scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
         pCounters[i].storage = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR;

         memset(&pCounters[i].uuid, 0, sizeof(pCounters[i].uuid));
         strcpy((char*)&pCounters[i].uuid, "RADV");

         const uint32_t uuid = descs[i].uuid;
         memcpy(&pCounters[i].uuid[12], &uuid, sizeof(uuid));
      }

      if (pCounterDescriptions) {
         pCounterDescriptions[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
         pCounterDescriptions[i].flags =
            VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR;
         strcpy(pCounterDescriptions[i].name, descs[i].name);
         strcpy(pCounterDescriptions[i].category, descs[i].category);
         strcpy(pCounterDescriptions[i].description, descs[i].description);
      }
   }
   return result;
}

void
radv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice,
   const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, uint32_t *pNumPasses)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (pPerformanceQueryCreateInfo->counterIndexCount == 0) {
      *pNumPasses = 0;
      return;
   }

   if (!radv_init_perfcounter_descs(pdevice)) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to init perf counters\n");
      *pNumPasses = 1;
      return;
   }

   assert(vk_queue_to_radv(pdevice, pPerformanceQueryCreateInfo->queueFamilyIndex) ==
          RADV_QUEUE_GENERAL);

   unsigned num_regs = 0;
   uint32_t *regs = NULL;
   VkResult result =
      radv_get_counter_registers(pdevice, pPerformanceQueryCreateInfo->counterIndexCount,
                                 pPerformanceQueryCreateInfo->pCounterIndices, &num_regs, &regs);
   if (result != VK_SUCCESS) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to allocate memory for perf counters\n");
   }

   *pNumPasses = radv_get_num_counter_passes(pdevice, num_regs, regs);
   free(regs);
}