/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "si_build_pm4.h"
#include "si_query.h"
#include "util/u_memory.h"

#include "ac_perfcounter.h"

struct si_query_group {
   struct si_query_group *next;
   struct ac_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;
   int instance;
   unsigned num_counters;
   unsigned selectors[AC_QUERY_MAX_COUNTERS];
};

struct si_query_counter {
   unsigned base;
   unsigned qwords;
   unsigned stride; /* in uint64s */
};

struct si_query_pc {
   struct si_query b;
   struct si_query_buffer buffer;

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;
   unsigned num_counters;
   struct si_query_counter *counters;
   struct si_query_group *groups;
};

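/* Program GRBM_GFX_INDEX so that subsequent perfcounter register writes target a
 * specific shader engine and block instance. Passing -1 for se or instance
 * selects broadcast writes to all of them.
 */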
static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (sctx->gfx_level >= GFX10) {
      /* TODO: Expose counters from each shader array separately if needed. */
      value |= S_030800_SA_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
   radeon_end();
}

void si_pc_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false);
   radeon_emit(shaders & 0x7f);
   radeon_emit(0xffffffff);
   radeon_end();
}

static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   radeon_begin(cs);

   for (idx = 0; idx < count; ++idx) {
      radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false);
      radeon_emit(selectors[idx] | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false);
      radeon_emit(0);
   }

   radeon_end();
}

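/* Begin counting. The dword at va doubles as a fence: it is set to 1 here and
 * cleared to 0 by an end-of-pipe event in si_pc_emit_stop, so the stop path can
 * wait until all prior work has drained before sampling the counters.
 */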
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
   radeon_end();
}

/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * add it again here. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   if (!sctx->screen->info.never_send_perfcounter_stop) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_uconfig_reg(
      R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(sctx->screen->info.never_stop_sq_perf_counters ?
                                V_036020_CP_PERFMON_STATE_START_COUNTING :
                                V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
         S_036020_PERFMON_SAMPLE_ENABLE(1));
   radeon_end();
}

void si_pc_emit_spm_start(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);

   /* Start SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));
   /* Start windowed performance counters. */
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));

   radeon_end();
}

void si_pc_emit_spm_stop(struct radeon_cmdbuf *cs, bool never_stop_sq_perf_counters,
                         bool never_send_perfcounter_stop)
{
   radeon_begin(cs);

   /* Stop windowed performance counters. */
   if (!never_send_perfcounter_stop) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(never_stop_sq_perf_counters ?
                                                           V_036020_STRM_PERFMON_STATE_START_COUNTING :
                                                           V_036020_STRM_PERFMON_STATE_STOP_COUNTING));

   radeon_end();
}

void si_pc_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_end();
}

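/* Copy the current value of every selected counter register into consecutive
 * 64-bit slots starting at va, using CP COPY_DATA. Blocks without real select
 * registers ("fake" counters) just get zeros written instead.
 */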
static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   radeon_begin(cs);

   if (regs->select0) {
      for (idx = 0; idx < count; ++idx) {
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(reg >> 2);
         radeon_emit(0); /* unused */
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake counters. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL);
         radeon_emit(0); /* immediate */
         radeon_emit(0);
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
      }
   }
   radeon_end();
}

static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   while (query->groups) {
      struct si_query_group *group = query->groups;
      query->groups = group->next;
      FREE(group);
   }

   FREE(query->counters);

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   FREE(query);
}

void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (sctx->gfx_level >= GFX11)
      return;

   radeon_begin(&sctx->gfx_cs);

   if (sctx->gfx_level >= GFX10) {
      radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->gfx_level >= GFX8) {
      radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
   radeon_end();
}

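/* (Re)start the query: allocate space for one result sample, program the shader
 * mask, inhibit perfcounter clock gating, program the counter selects for every
 * group, and kick off counting.
 */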
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
/*
                                   struct si_query_hw *hwquery,
                                   struct si_resource *buffer, uint64_t va)*/
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx, 0);

   if (query->shaders)
      si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;

      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}

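/* Stop counting and read the results back. For each group, one 64-bit slot is
 * written per counter, repeated for every shader engine and block instance the
 * group covers (broadcast groups iterate over all of them).
 */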
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, false);
}

static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_query_buffer_reset(ctx, &query->buffer);

   list_addtail(&query->b.active_list, &ctx->active_queries);
   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;

   si_pc_query_resume(ctx, squery);

   return true;
}

static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_pc_query_suspend(ctx, squery);

   list_del(&squery->active_list);
   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;

   return query->buffer.buf != NULL;
}

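/* Accumulate one sample into the user-visible result: each counter sums the low
 * 32 bits of its qwords across all shader engines/instances it covers.
 */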
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}

static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      if (squery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}

static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};

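/* Find or create the si_query_group for (block, sub_gid). Depending on the
 * block's flags, the sub group id encodes the shader stage, the shader engine
 * and the block instance that this group of counters is restricted to.
 */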
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct ac_pc_block *block, unsigned sub_gid)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_query_group *group = query->groups;

   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (ac_pc_block_has_per_se_groups(&pc->base, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = ac_pc_shader_type_bits[shader_id];

      query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      /* A non-zero value in query->shaders ensures that the shader
       * masking is reset unless the user explicitly requests one.
       */
      query->shaders = AC_PC_SHADERS_WINDOWING;
   }

   if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   group->next = query->groups;
   query->groups = group;

   return group;
}

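/* Create a batch query from a list of performance counter query types. Two
 * passes: first collect the selected counters into per-(block, sub group)
 * groups, then compute the result layout and map each user counter back to its
 * slot(s) in the result buffer.
 */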
struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types)
{
   struct si_screen *screen = (struct si_screen *)ctx->screen;
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   struct si_query_group *group;
   struct si_query_pc *query;
   unsigned base_gid, sub_gid, sub_index;
   unsigned i, j;

   if (!pc)
      return NULL;

   query = CALLOC_STRUCT(si_query_pc);
   if (!query)
      return NULL;

   query->b.ops = &batch_query_ops;

   query->num_counters = num_queries;

   /* Collect selectors per group */
   for (i = 0; i < num_queries; ++i) {
      unsigned sub_gid;

      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
         goto error;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
      if (!block)
         goto error;

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      if (!group)
         goto error;

      if (group->num_counters >= block->b->b->num_counters) {
         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
         goto error;
      }
      group->selectors[group->num_counters] = sub_index;
      ++group->num_counters;
   }

   /* Compute result bases and CS size per group */
   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

   i = 0;
   for (group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned read_dw;
      unsigned instances = 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         instances = screen->info.max_se;
      if (group->instance < 0)
         instances *= block->num_instances;

      group->result_base = i;
      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
      i += instances * group->num_counters;

      read_dw = 6 * group->num_counters;
      query->b.num_cs_dw_suspend += instances * read_dw;
      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
   }

   if (query->shaders) {
      if (query->shaders == AC_PC_SHADERS_WINDOWING)
         query->shaders = 0xffffffff;
   }

   /* Map user-supplied query array to result indices */
   query->counters = CALLOC(num_queries, sizeof(*query->counters));
   for (i = 0; i < num_queries; ++i) {
      struct si_query_counter *counter = &query->counters[i];
      struct ac_pc_block *block;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      assert(group != NULL);

      for (j = 0; j < group->num_counters; ++j) {
         if (group->selectors[j] == sub_index)
            break;
      }

      counter->base = group->result_base + j;
      counter->stride = group->num_counters;

      counter->qwords = 1;
      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         counter->qwords = screen->info.max_se;
      if (group->instance < 0)
         counter->qwords *= block->num_instances;
   }

   return (struct pipe_query *)query;

error:
   si_pc_query_destroy((struct si_context *)ctx, &query->b);
   return NULL;
}

int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
                            struct pipe_driver_query_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   unsigned base_gid, sub;

   if (!pc)
      return 0;

   if (!info) {
      unsigned bid, num_queries = 0;

      for (bid = 0; bid < pc->base.num_blocks; ++bid) {
         num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
      }

      return num_queries;
   }

   block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
   if (!block)
      return 0;

   if (!block->selector_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->selector_names + sub * block->selector_name_stride;
   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
   info->max_value.u64 = 0;
   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
   info->group_id = base_gid + sub / block->b->selectors;
   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
   return 1;
}

int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
                                  struct pipe_driver_query_group_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;

   if (!pc)
      return 0;

   if (!info)
      return pc->base.num_groups;

   block = ac_lookup_group(&pc->base, &index);
   if (!block)
      return 0;

   if (!block->group_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->group_names + index * block->group_name_stride;
   info->num_queries = block->b->selectors;
   info->max_active_queries = block->b->b->num_counters;
   return 1;
}

void si_destroy_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc = screen->perfcounters;

   if (!pc)
      return;

   ac_destroy_perfcounters(&pc->base);
   FREE(pc);
   screen->perfcounters = NULL;
}

void si_init_perfcounters(struct si_screen *screen)
{
   bool separate_se, separate_instance;

   separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!screen->perfcounters)
      return;

   screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   screen->perfcounters->num_instance_cs_dwords = 3;

   if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
                             &screen->perfcounters->base)) {
      si_destroy_perfcounters(screen);
   }
}

static bool
si_spm_init_bo(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size = 32 * 1024 * 1024; /* Default to 32MB. */

   sctx->spm_trace.buffer_size = size;
   sctx->spm_trace.sample_interval = 4096; /* Default to 4096 clk. */

   sctx->spm_trace.bo = ws->buffer_create(
      ws, size, 4096,
      RADEON_DOMAIN_VRAM,
      RADEON_FLAG_NO_INTERPROCESS_SHARING |
         RADEON_FLAG_GTT_WC |
         RADEON_FLAG_NO_SUBALLOC);

   return sctx->spm_trace.bo != NULL;
}

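/* Program the counter select registers for the SPM counters: the SQ selects
 * first, then each remaining block according to its GRBM_GFX_INDEX target, and
 * finally restore global broadcasting.
 */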
static void
si_emit_spm_counters(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm_trace_data *spm_trace = &sctx->spm_trace;

   radeon_begin(cs);

   for (uint32_t b = 0; b < spm_trace->num_used_sq_block_sel; b++) {
      struct ac_spm_block_select *sq_block_sel = &spm_trace->sq_block_sel[b];
      const struct ac_spm_counter_select *cntr_sel = &sq_block_sel->counters[0];
      uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT;

      radeon_set_uconfig_reg_seq(reg_base + b * 4, 1, false);
      radeon_emit(cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */
   }

   for (uint32_t b = 0; b < spm_trace->num_block_sel; b++) {
      struct ac_spm_block_select *block_sel = &spm_trace->block_sel[b];
      struct ac_pc_block_base *regs = block_sel->b->b->b;

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, block_sel->grbm_gfx_index);

      for (unsigned c = 0; c < block_sel->num_counters; c++) {
         const struct ac_spm_counter_select *cntr_sel = &block_sel->counters[c];

         if (!cntr_sel->active)
            continue;

         radeon_set_uconfig_reg_seq(regs->select0[c], 1, false);
         radeon_emit(cntr_sel->sel0);

         radeon_set_uconfig_reg_seq(regs->select1[c], 1, false);
         radeon_emit(cntr_sel->sel1);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

   radeon_end();
}

#define SPM_RING_BASE_ALIGN 32

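/* Configure streaming performance monitoring (SPM): point the RLC at the output
 * ring buffer, program the per-SE and global segment sizes from the muxsel line
 * counts, upload every muxsel RAM through MUXSEL_ADDR/MUXSEL_DATA, and finally
 * program the counter selects.
 */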
void
si_emit_spm_setup(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm_trace_data *spm_trace = &sctx->spm_trace;
   uint64_t va = sctx->screen->ws->buffer_get_virtual_address(spm_trace->bo);
   uint64_t ring_size = spm_trace->buffer_size;

   /* It's required that the ring VA and the size are correctly aligned. */
   assert(!(va & (SPM_RING_BASE_ALIGN - 1)));
   assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1)));
   assert(spm_trace->sample_interval >= 32);

   radeon_begin(cs);

   /* Configure the SPM ring buffer. */
   radeon_set_uconfig_reg(R_037200_RLC_SPM_PERFMON_CNTL,
                          S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */
                             S_037200_PERFMON_SAMPLE_INTERVAL(spm_trace->sample_interval)); /* in sclk */
   radeon_set_uconfig_reg(R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va);
   radeon_set_uconfig_reg(R_037208_RLC_SPM_PERFMON_RING_BASE_HI,
                          S_037208_RING_BASE_HI(va >> 32));
   radeon_set_uconfig_reg(R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size);

   /* Configure the muxsel. */
   uint32_t total_muxsel_lines = 0;
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      total_muxsel_lines += spm_trace->num_muxsel_lines[s];
   }

   radeon_set_uconfig_reg(R_03726C_RLC_SPM_ACCUM_MODE, 0);
   radeon_set_uconfig_reg(R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0);
   radeon_set_uconfig_reg(R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE,
                          S_03727C_SE0_NUM_LINE(spm_trace->num_muxsel_lines[0]) |
                             S_03727C_SE1_NUM_LINE(spm_trace->num_muxsel_lines[1]) |
                             S_03727C_SE2_NUM_LINE(spm_trace->num_muxsel_lines[2]) |
                             S_03727C_SE3_NUM_LINE(spm_trace->num_muxsel_lines[3]));
   radeon_set_uconfig_reg(R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE,
                          S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) |
                             S_037280_GLOBAL_NUM_LINE(spm_trace->num_muxsel_lines[4]));

   /* Upload each muxsel ram to the RLC. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned rlc_muxsel_addr, rlc_muxsel_data;
      unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) |
                                S_030800_INSTANCE_BROADCAST_WRITES(1);

      if (!spm_trace->num_muxsel_lines[s])
         continue;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);

         rlc_muxsel_addr = R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR;
         rlc_muxsel_data = R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA;
      } else {
         grbm_gfx_index |= S_030800_SE_INDEX(s);

         rlc_muxsel_addr = R_03721C_RLC_SPM_SE_MUXSEL_ADDR;
         rlc_muxsel_data = R_037220_RLC_SPM_SE_MUXSEL_DATA;
      }

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, grbm_gfx_index);

      for (unsigned l = 0; l < spm_trace->num_muxsel_lines[s]; l++) {
         uint32_t *data = (uint32_t *)spm_trace->muxsel_lines[s][l].muxsel;

         /* Select MUXSEL_ADDR to point to the next muxsel. */
         radeon_set_uconfig_reg(rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE);

         /* Write the muxsel line configuration with MUXSEL_DATA. */
         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0));
         radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) |
                     S_370_WR_CONFIRM(1) |
                     S_370_ENGINE_SEL(V_370_ME) |
                     S_370_WR_ONE_ADDR(1));
         radeon_emit(rlc_muxsel_data >> 2);
         radeon_emit(0);
         radeon_emit_array(data, AC_SPM_MUXSEL_LINE_SIZE);
      }
   }
   radeon_end();

   /* Select SPM counters. */
   si_emit_spm_counters(sctx, cs);
}

bool
si_spm_init(struct si_context *sctx)
{
   const struct radeon_info *info = &sctx->screen->info;

   sctx->screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!sctx->screen->perfcounters)
      return false;

   sctx->screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(sctx->screen);
   sctx->screen->perfcounters->num_instance_cs_dwords = 3;

   struct ac_perfcounters *pc = &sctx->screen->perfcounters->base;
   struct ac_spm_counter_create_info spm_counters[] = {

      /* XXX: doesn't work */
      {TCP, 0, 0x9},  /* Number of L2 requests. */
      {TCP, 0, 0x12}, /* Number of L2 misses. */

      /* Scalar cache hit */
      {SQ, 0, 0x14f}, /* Number of SCACHE hits. */
      {SQ, 0, 0x150}, /* Number of SCACHE misses. */
      {SQ, 0, 0x151}, /* Number of SCACHE misses duplicate. */

      /* Instruction cache hit */
      {SQ, 0, 0x12c}, /* Number of ICACHE hits. */
      {SQ, 0, 0x12d}, /* Number of ICACHE misses. */
      {SQ, 0, 0x12e}, /* Number of ICACHE misses duplicate. */

      /* XXX: doesn't work */
      {GL1C, 0, 0xe},  /* Number of GL1C requests. */
      {GL1C, 0, 0x12}, /* Number of GL1C misses. */

      /* L2 cache hit */
      {GL2C, 0, 0x3},                                      /* Number of GL2C requests. */
      {GL2C, 0, info->gfx_level >= GFX10_3 ? 0x2b : 0x23}, /* Number of GL2C misses. */
   };

   if (!ac_init_perfcounters(info, false, false, pc))
      return false;

   if (!ac_init_spm(info, pc, ARRAY_SIZE(spm_counters), spm_counters, &sctx->spm_trace))
      return false;

   if (!si_spm_init_bo(sctx))
      return false;

   return true;
}

void
si_spm_finish(struct si_context *sctx)
{
   struct pb_buffer *bo = sctx->spm_trace.bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   ac_destroy_spm(&sctx->spm_trace);
}