/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "ac_perfcounter.h"
#include "amdgfxregs.h"
#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

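/* Select which shader stages the SQ(G) performance counters sample. The low 7
 * bits are the per-stage enable mask; GFX11 moved the field into
 * SQG_PERFCOUNTER_CTRL, older generations program SQ_PERFCOUNTER_CTRL and
 * write all-ones to the following control register.
 */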
void
radv_perfcounter_emit_shaders(struct radv_device *device, struct radeon_cmdbuf *cs, unsigned shaders)
{
   if (device->physical_device->rad_info.gfx_level >= GFX11) {
      radeon_set_uconfig_reg(cs, R_036760_SQG_PERFCOUNTER_CTRL, shaders & 0x7f);
   } else {
      radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
      radeon_emit(cs, shaders & 0x7f);
      radeon_emit(cs, 0xffffffff);
   }
}

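/* Enable/disable windowed counting: the PERFCOUNTER_START/STOP events only
 * exist on the graphics queue, while compute counting is gated through
 * COMPUTE_PERFCOUNT_ENABLE on every queue.
 */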
static void
radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family, bool enable)
{
   if (family == RADV_QUEUE_GENERAL) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable));
}

void
radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
}

void
radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   /* Start SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(device, cs, family, true);
}

void
radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   radv_emit_windowed_counters(device, cs, family, false);

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(device->physical_device->rad_info.never_stop_sq_perf_counters
                                                           ? V_036020_STRM_PERFMON_STATE_START_COUNTING
                                                           : V_036020_STRM_PERFMON_STATE_STOP_COUNTING));
}

enum radv_perfcounter_op {
   RADV_PC_OP_SUM,
   RADV_PC_OP_MAX,
   RADV_PC_OP_RATIO_DIVSCALE,
   RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */
   RADV_PC_OP_SUM_WEIGHTED_4,
};

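/* A counter implementation refers to hardware counters through packed 32-bit
 * "regs". Two encodings share bit 31 as a "constant" flag:
 *
 *   - While describing counters: bits 0-15 hold the per-block event selector
 *     and bits 16-30 the ac_pc_gpu_block, built with the CTR() macro below.
 *   - After radv_pc_init_query_pool() remaps them: bits 0-15 hold the byte
 *     offset of the counter's data in the query buffer and bits 16-30 the
 *     number of sampled instances.
 *
 * With the constant bit set, the remaining 31 bits are a literal value.
 */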
#define S_REG_SEL(x) ((x)&0xFFFF)
#define G_REG_SEL(x) ((x)&0xFFFF)
#define S_REG_BLOCK(x) ((x) << 16)
#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF)

#define S_REG_OFFSET(x) ((x)&0xFFFF)
#define G_REG_OFFSET(x) ((x)&0xFFFF)
#define S_REG_INSTANCES(x) ((x) << 16)
#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF)
#define S_REG_CONSTANT(x) ((x) << 31)
#define G_REG_CONSTANT(x) ((x) >> 31)

struct radv_perfcounter_impl {
   enum radv_perfcounter_op op;
   uint32_t regs[8];
};

/* Only append to this list, never insert into the middle or remove entries
 * (renaming is fine).
 *
 * The invariant is that a UUID identifies a counter's meaning: counters with
 * the same meaning share a UUID even when their implementations differ
 * between GPUs, but each UUID must be unique within a single GPU.
 */
enum radv_perfcounter_uuid {
   RADV_PC_UUID_GPU_CYCLES,
   RADV_PC_UUID_SHADER_WAVES,
   RADV_PC_UUID_SHADER_INSTRUCTIONS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS,
   RADV_PC_UUID_SHADER_VALU_BUSY,
   RADV_PC_UUID_SHADER_SALU_BUSY,
   RADV_PC_UUID_VRAM_READ_SIZE,
   RADV_PC_UUID_VRAM_WRITE_SIZE,
   RADV_PC_UUID_L0_CACHE_HIT_RATIO,
   RADV_PC_UUID_L1_CACHE_HIT_RATIO,
   RADV_PC_UUID_L2_CACHE_HIT_RATIO,
};

struct radv_perfcounter_desc {
   struct radv_perfcounter_impl impl;

   VkPerformanceCounterUnitKHR unit;

   char name[VK_MAX_DESCRIPTION_SIZE];
   char category[VK_MAX_DESCRIPTION_SIZE];
   char description[VK_MAX_DESCRIPTION_SIZE];
   enum radv_perfcounter_uuid uuid;
};

#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...) \
   (struct radv_perfcounter_desc) \
   { \
      .impl = {.op = arg_op, .regs = {__VA_ARGS__}}, .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR, \
      .name = arg_name, .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid \
   }

#define ADD_PC(op, unit, name, category, description, uuid, ...) \
   do { \
      if (descs) { \
         descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__); \
      } \
      ++*count; \
   } while (0)
#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr))
#define CONSTANT(v) (S_REG_CONSTANT(1) | (uint32_t)(v))

enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) };

enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) };

enum {
   GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe),
   GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12),
};

enum {
   GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3),

   GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23),
   GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c),

   GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b),
   GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66),
};

enum {
   SQ_PERF_SEL_WAVES = CTR(SQ, 0x4),
   SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31),
   SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37),
   SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b),
   SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c),
   SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d),
   SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40),
   SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45),
   SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46),
   SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75),
};

enum {
   TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9),
   TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12),
};

#define CTR_NUM_SIMD CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_cu)
#define CTR_NUM_CUS CONSTANT(pdev->rad_info.num_cu)

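/* Build the list of exposed counters. Called twice by
 * radv_init_perfcounter_descs(): first with descs == NULL to only count the
 * entries, then again with the allocated array to fill it in.
 */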
static void
radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count, struct radv_perfcounter_desc *descs)
{
   *count = 0;

   ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM", "cycles the GPU is active processing a command buffer.",
          GPU_CYCLES, GRBM_PERF_SEL_GUI_ACTIVE);

   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES, SQ_PERF_SEL_WAVES);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed", SHADER_INSTRUCTIONS,
          SQ_PERF_SEL_INSTS_ALL_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders", "Number of VALU Instructions executed",
          SHADER_INSTRUCTIONS_VALU, SQ_PERF_SEL_INSTS_VALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders", "Number of SALU Instructions executed",
          SHADER_INSTRUCTIONS_SALU, SQ_PERF_SEL_INSTS_SALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders", "Number of VMEM load instructions executed",
          SHADER_INSTRUCTIONS_VMEM_LOAD, SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders", "Number of SMEM load instructions executed",
          SHADER_INSTRUCTIONS_SMEM_LOAD, SQ_PERF_SEL_INSTS_SMEM_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders", "Number of VMEM store instructions executed",
          SHADER_INSTRUCTIONS_VMEM_STORE, SQ_PERF_SEL_INSTS_TEX_STORE_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders", "Number of LDS Instructions executed",
          SHADER_INSTRUCTIONS_LDS, SQ_PERF_SEL_INSTS_LDS_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders", "Number of GDS Instructions executed",
          SHADER_INSTRUCTIONS_GDS, SQ_PERF_SEL_INSTS_GDS_GFX10);

   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization",
          "Percentage of time the VALU units are busy", SHADER_VALU_BUSY, SQ_PERF_SEL_INST_CYCLES_VALU_GFX10,
          CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD);
   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization",
          "Percentage of time the SALU units are busy", SHADER_SALU_BUSY, SQ_PERF_SEL_INSTS_SALU_GFX10,
          CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS);

   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", "Number of bytes read from VRAM",
             VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103, CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103,
             CONSTANT(64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", "Number of bytes written to VRAM",
             VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103, CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103,
             CONSTANT(64), CONSTANT(0), CONSTANT(0), CONSTANT(0), CONSTANT(0));
   } else {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", "Number of bytes read from VRAM",
             VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101, CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101,
             CONSTANT(64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", "Number of bytes written to VRAM",
             VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101, CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101,
             CONSTANT(32), CONSTANT(0), CONSTANT(0), CONSTANT(0), CONSTANT(0));
   }

   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache", L0_CACHE_HIT_RATIO,
          TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10);
   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache", L1_CACHE_HIT_RATIO,
          GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ);
   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", "Hit ratio of L2 cache",
             L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103, GL2C_PERF_SEL_REQ);
   } else {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", "Hit ratio of L2 cache",
             L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101, GL2C_PERF_SEL_REQ);
   }
}

static bool
radv_init_perfcounter_descs(struct radv_physical_device *pdev)
{
   if (pdev->perfcounters)
      return true;

   uint32_t count;
   radv_query_perfcounter_descs(pdev, &count, NULL);

   struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count);
   if (!descs)
      return false;

   radv_query_perfcounter_descs(pdev, &count, descs);
   pdev->num_perfcounters = count;
   pdev->perfcounters = descs;

   return true;
}

static int
cmp_uint32_t(const void *a, const void *b)
{
   uint32_t l = *(const uint32_t *)a;
   uint32_t r = *(const uint32_t *)b;

   return (l < r) ? -1 : (l > r) ? 1 : 0;
}

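/* Gather the hardware selectors needed for a set of counter indices: collect
 * every non-constant reg of the chosen counters, then sort and deduplicate so
 * each selector is only programmed and sampled once.
 */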
static VkResult
radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices, const uint32_t *indices,
                           unsigned *out_num_regs, uint32_t **out_regs)
{
   ASSERTED uint32_t num_counters = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs);
   uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t));
   if (!regs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned reg_cnt = 0;
   for (unsigned i = 0; i < num_indices; ++i) {
      uint32_t index = indices[i];
      assert(index < num_counters);
      for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j]; ++j) {
         if (!G_REG_CONSTANT(descs[index].impl.regs[j]))
            regs[reg_cnt++] = descs[index].impl.regs[j];
      }
   }

   qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t);

   unsigned deduped_reg_cnt = 0;
   for (unsigned i = 1; i < reg_cnt; ++i) {
      if (regs[i] != regs[deduped_reg_cnt])
         regs[++deduped_reg_cnt] = regs[i];
   }
   ++deduped_reg_cnt;

   *out_num_regs = deduped_reg_cnt;
   *out_regs = regs;
   return VK_SUCCESS;
}

static unsigned
radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block)
{
   return ac_block->num_instances * ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdevice->rad_info.max_se : 1);
}

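/* The register list is sorted, so all selectors of the same block are
 * contiguous. A block only exposes b->b->num_counters hardware counters, so
 * the query needs as many passes as the most demanding block requires.
 */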
static unsigned
radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs, const uint32_t *regs)
{
   enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK;
   unsigned block_reg_count = 0;
   struct ac_pc_block *ac_block = NULL;
   unsigned passes_needed = 1;

   for (unsigned i = 0; i < num_regs; ++i) {
      enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]);

      if (block != prev_block) {
         block_reg_count = 0;
         prev_block = block;
         ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      }

      ++block_reg_count;

      passes_needed = MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters));
   }

   return passes_needed;
}

void
radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool)
{
   free(pool->counters);
   free(pool->pc_regs);
}

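/* Per query, the result buffer is laid out as one slot per hardware register,
 * in the (sorted) pc_regs order:
 *
 *   for each register:
 *     for each instance: { uint64_t begin_value; uint64_t end_value; }
 *
 * followed by 8 bytes per pass used as a completion signal. The counter
 * implementations are rewritten here so that their regs point at these slots
 * (offset + instance count) instead of hardware selectors.
 */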
VkResult
radv_pc_init_query_pool(struct radv_physical_device *pdevice, const VkQueryPoolCreateInfo *pCreateInfo,
                        struct radv_pc_query_pool *pool)
{
   const VkQueryPoolPerformanceCreateInfoKHR *perf_info =
      vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
   VkResult result;

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   result = radv_get_counter_registers(pdevice, perf_info->counterIndexCount, perf_info->pCounterIndices,
                                       &pool->num_pc_regs, &pool->pc_regs);
   if (result != VK_SUCCESS)
      return result;

   pool->num_passes = radv_get_num_counter_passes(pdevice, pool->num_pc_regs, pool->pc_regs);

   uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t));
   if (!pc_reg_offsets)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned offset = 0;
   for (unsigned i = 0; i < pool->num_pc_regs; ++i) {
      enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16;
      struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

      pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances);
      offset += sizeof(uint64_t) * 2 * num_instances;
   }

   /* Reserve 8 bytes per pass to signal completion of that pass. */
   pool->b.stride = offset + 8 * pool->num_passes;

   pool->num_counters = perf_info->counterIndexCount;
   pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl));
   if (!pool->counters) {
      free(pc_reg_offsets);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   for (unsigned i = 0; i < pool->num_counters; ++i) {
      pool->counters[i] = pdevice->perfcounters[perf_info->pCounterIndices[i]].impl;

      for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) {
         uint32_t reg = pool->counters[i].regs[j];
         if (!reg || G_REG_CONSTANT(reg))
            continue;

         unsigned k;
         for (k = 0; k < pool->num_pc_regs; ++k)
            if (pool->pc_regs[k] == reg)
               break;
         pool->counters[i].regs[j] = pc_reg_offsets[k];
      }
   }

   free(pc_reg_offsets);
   return VK_SUCCESS;
}

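/* Point GRBM_GFX_INDEX at a specific shader engine / block instance, or
 * broadcast when the corresponding index is negative.
 */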
static void
radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
}

static void
radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, unsigned *selectors)
{
   const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level;
   const enum radv_queue_family qf = cmd_buffer->qf;
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   for (idx = 0; idx < count; ++idx) {
      radeon_set_perfctr_reg(gfx_level, qf, cs, regs->select0[idx], G_REG_SEL(selectors[idx]) | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1);
      radeon_emit(cs, 0);
   }
}

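/* Read back `count` counters of a block for the currently selected
 * SE/instance. Each counter value is copied with COPY_DATA (64-bit,
 * perf-register source) into the query buffer; va advances by a whole
 * register slot (2 * 8 bytes * num_instances) per counter so consecutive
 * counters of the block land in consecutive slots.
 */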
static void
radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
                                 uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   assert(regs->select0);
   for (unsigned idx = 0; idx < count; ++idx) {
      if (regs->counters)
         reg = regs->counters[idx];

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM |
                         COPY_DATA_COUNT_SEL); /* 64 bits */
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      va += sizeof(uint64_t) * 2 * radv_pc_get_num_instances(cmd_buffer->device->physical_device, block);
      reg += reg_delta;
   }
}

static void
radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, uint64_t va)
{
   unsigned se_end = 1;
   if (block->b->b->flags & AC_PC_BLOCK_SE)
      se_end = cmd_buffer->device->physical_device->rad_info.max_se;

   for (unsigned se = 0; se < se_end; ++se) {
      for (unsigned instance = 0; instance < block->num_instances; ++instance) {
         radv_emit_instance(cmd_buffer, se, instance);
         radv_pc_emit_block_instance_read(cmd_buffer, block, count, va);
         va += sizeof(uint64_t) * 2;
      }
   }
}

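/* Wait for previously submitted work to finish before sampling: a
 * CS_PARTIAL_FLUSH event, an ACQUIRE_MEM covering the full address range and
 * a PFP_SYNC_ME so the prefetcher does not run ahead of the sample.
 */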
static void
radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

   radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   radeon_emit(cs, 0);          /* CP_COHER_CNTL */
   radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
   radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
   radeon_emit(cs, 0);          /* CP_COHER_BASE */
   radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
   radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
   radeon_emit(cs, 0);          /* GCR_CNTL */

   radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
   radeon_emit(cs, 0);
}

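/* Stop counting and sample every configured counter into the query buffer.
 * radv_pc_begin_query() calls this with end == false to capture the "begin"
 * values (offset +0 of each instance pair), radv_pc_end_query() with
 * end == true for the "end" values (offset +8). Each pass is wrapped in a
 * COND_EXEC on the per-pass predicate in the device's perf counter BO, so
 * only the registers belonging to the pass selected at submit time are read;
 * when end == true, a completion dword is also written for that pass at the
 * end of the query slot.
 */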
static void
radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va, bool end)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   radv_pc_wait_idle(cmd_buffer);

   radv_emit_instance(cmd_buffer, -1, -1);
   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, false);

   radeon_set_uconfig_reg(
      cs, R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET + 8 * pass;
      uint64_t reg_va = va + (end ? 8 : 0);

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;
         unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt,
                                 reg_va + offset * num_instances * sizeof(uint64_t));
         }

         i += cnt;
         reg_va += num_instances * sizeof(uint64_t) * 2 * cnt;
      }

      if (end) {
         uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass;
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
         radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
         radeon_emit(cs, signal_va);
         radeon_emit(cs, signal_va >> 32);
         radeon_emit(cs, 1); /* value */
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);
}

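/* Begin the query: clear the fence in the perf counter BO, reset the perfmon
 * state, inhibit clockgating, program the counter selectors for every pass
 * (predicated with COND_EXEC so only the current pass takes effect), take the
 * "begin" sample and finally start counting.
 */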
void
radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
   ASSERTED unsigned cdw_max;

   cmd_buffer->state.uses_perf_counters = true;

   cdw_max = radeon_check_space(cmd_buffer->device->ws, cs,
                                256 +                     /* Random one time stuff */
                                   10 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * (5 + 8));

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
   radeon_emit(cs, perf_ctr_va);
   radeon_emit(cs, perf_ctr_va >> 32);
   radeon_emit(cs, 0); /* value */

   radv_pc_wait_idle(cmd_buffer);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));

   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, true);
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, true);
   radv_perfcounter_emit_shaders(cmd_buffer->device, cs, 0x7f);

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET + 8 * pass;

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         unsigned offset = ac_block->num_instances * pass;

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset);
         }

         i += cnt;
      }

      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   radv_emit_instance(cmd_buffer, -1, -1);

   /* The following sequence actually starts the perfcounters. */

   radv_pc_stop_and_sample(cmd_buffer, pool, va, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, true);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

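/* End the query: wait for the command buffer to drain (bottom-of-pipe fence
 * in the perf counter BO), take the "end" sample, then disable the counters
 * and restore clockgating and SPI config.
 */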
void
radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   ASSERTED unsigned cdw_max;

   cdw_max = radeon_check_space(cmd_buffer->device->ws, cs,
                                256 +                    /* Reserved for things that don't scale with passes/counters */
                                   5 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * 8);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   uint64_t perf_ctr_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
                                V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va,
                                1, cmd_buffer->gfx9_fence_va);
   radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff);

   radv_pc_wait_idle(cmd_buffer);
   radv_pc_stop_and_sample(cmd_buffer, pool, va, true);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, false);
   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, false);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

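/* Result accumulation helpers. A reg either encodes a constant (bit 31 set)
 * or an (offset, instance count) pair pointing at begin/end value pairs in
 * the query buffer: radv_pc_sum_reg() accumulates end - begin over all
 * instances, radv_pc_max_reg() takes the maximum end value.
 */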
static uint64_t
radv_pc_sum_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result += data[offset + 2 * i + 1] - data[offset + 2 * i];
   }

   return result;
}

static uint64_t
radv_pc_max_reg(uint32_t reg, const uint64_t *data)
{
   unsigned instances = G_REG_INSTANCES(reg);
   unsigned offset = G_REG_OFFSET(reg) / 8;
   uint64_t result = 0;

   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   for (unsigned i = 0; i < instances; ++i) {
      result = MAX2(result, data[offset + 2 * i + 1]);
   }

   return result;
}

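/* Combine the sampled registers into the value reported to the application,
 * according to the counter's op:
 *
 *   RATIO_DIVSCALE:  regs[0] / regs[1] / regs[2] * 100 (a percentage)
 *   REVERSE_RATIO:   (regs[1] - regs[0]) / regs[1] * 100
 *   SUM_WEIGHTED_4:  sum of regs[2i] * regs[2i+1] for i in [0, 4)
 */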
static union VkPerformanceCounterResultKHR
radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data)
{
   union VkPerformanceCounterResultKHR result;

   switch (impl->op) {
   case RADV_PC_OP_MAX:
      result.float64 = radv_pc_max_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_SUM:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_RATIO_DIVSCALE:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data) / (double)radv_pc_sum_reg(impl->regs[1], data) /
                       radv_pc_sum_reg(impl->regs[2], data) * 100.0;
      break;
   case RADV_PC_OP_REVERSE_RATIO: {
      double tmp = radv_pc_sum_reg(impl->regs[1], data);
      result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0;
      break;
   }
   case RADV_PC_OP_SUM_WEIGHTED_4:
      result.float64 = 0.0;
      for (unsigned i = 0; i < 4; ++i)
         result.float64 += radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data);
      break;
   default:
      unreachable("unhandled performance counter operation");
   }
   return result;
}

void
radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out)
{
   union VkPerformanceCounterResultKHR *pc_result = out;

   for (unsigned i = 0; i < pc_pool->num_counters; ++i) {
      pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data);
   }
}

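/* Vulkan entrypoints for VK_KHR_performance_query. Counters are only exposed
 * on the general queue; the reported counter UUID is the string "RADV"
 * followed by the radv_perfcounter_uuid value in the last four bytes.
 */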
VKAPI_ATTR VkResult VKAPI_CALL
radv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (vk_queue_to_radv(pdevice, queueFamilyIndex) != RADV_QUEUE_GENERAL) {
      *pCounterCount = 0;
      return VK_SUCCESS;
   }

   if (!radv_init_perfcounter_descs(pdevice))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t counter_cnt = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   if (!pCounters && !pCounterDescriptions) {
      *pCounterCount = counter_cnt;
      return VK_SUCCESS;
   }

   VkResult result = counter_cnt > *pCounterCount ? VK_INCOMPLETE : VK_SUCCESS;
   counter_cnt = MIN2(counter_cnt, *pCounterCount);
   *pCounterCount = counter_cnt;

   for (uint32_t i = 0; i < counter_cnt; ++i) {
      if (pCounters) {
         pCounters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
         pCounters[i].unit = descs[i].unit;
         pCounters[i].scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
         pCounters[i].storage = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR;

         memset(&pCounters[i].uuid, 0, sizeof(pCounters[i].uuid));
         strcpy((char *)&pCounters[i].uuid, "RADV");

         const uint32_t uuid = descs[i].uuid;
         memcpy(&pCounters[i].uuid[12], &uuid, sizeof(uuid));
      }

      if (pCounterDescriptions) {
         pCounterDescriptions[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
         pCounterDescriptions[i].flags = VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR;
         strcpy(pCounterDescriptions[i].name, descs[i].name);
         strcpy(pCounterDescriptions[i].category, descs[i].category);
         strcpy(pCounterDescriptions[i].description, descs[i].description);
      }
   }
   return result;
}

VKAPI_ATTR void VKAPI_CALL
radv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice, const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
   uint32_t *pNumPasses)
{
   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);

   if (pPerformanceQueryCreateInfo->counterIndexCount == 0) {
      *pNumPasses = 0;
      return;
   }

   if (!radv_init_perfcounter_descs(pdevice)) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to init perf counters\n");
      *pNumPasses = 1;
      return;
   }

   assert(vk_queue_to_radv(pdevice, pPerformanceQueryCreateInfo->queueFamilyIndex) == RADV_QUEUE_GENERAL);

   unsigned num_regs = 0;
   uint32_t *regs = NULL;
   VkResult result = radv_get_counter_registers(pdevice, pPerformanceQueryCreateInfo->counterIndexCount,
                                                pPerformanceQueryCreateInfo->pCounterIndices, &num_regs, &regs);
   if (result != VK_SUCCESS) {
      /* Can't return an error, so log */
      fprintf(stderr, "radv: Failed to allocate memory for perf counters\n");
   }

   *pNumPasses = radv_get_num_counter_passes(pdevice, num_regs, regs);
   free(regs);
}
890