/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"
#include "si_query.h"
#include "util/u_memory.h"

#include "ac_perfcounter.h"

struct si_query_group {
   struct si_query_group *next;
   struct ac_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;
   int instance;
   unsigned num_counters;
   unsigned selectors[AC_QUERY_MAX_COUNTERS];
};

struct si_query_counter {
   unsigned base;
   unsigned qwords;
   unsigned stride; /* in uint64s */
};

struct si_query_pc {
   struct si_query b;
   struct si_query_buffer buffer;

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;
   unsigned num_counters;
   struct si_query_counter *counters;
   struct si_query_group *groups;
};

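/* Select which shader engine and block instance subsequent perfcounter
 * register writes go to via GRBM_GFX_INDEX; a negative se/instance means
 * broadcast to all of them. */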
static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (sctx->gfx_level >= GFX10) {
      /* TODO: Expose counters from each shader array separately if needed. */
      value |= S_030800_SA_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
   radeon_end();
}

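/* Select which shader stages the SQ performance counters observe. */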
void si_pc_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(shaders & 0x7f);
   radeon_emit(0xffffffff);
   radeon_end();
}

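/* Program the per-counter event select registers of a block. Blocks without
 * select registers ("fake" counters) are skipped, and any SPM select
 * registers are cleared. */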
static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   radeon_begin(cs);

   for (idx = 0; idx < count; ++idx) {
      radeon_set_uconfig_reg_seq(regs->select0[idx], 1);
      radeon_emit(selectors[idx] | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(regs->select1[idx], 1);
      radeon_emit(0);
   }

   radeon_end();
}

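/* Write a nonzero fence value into the results buffer, then reset the perfmon
 * state and start counting. si_pc_emit_stop clears the fence at bottom-of-pipe
 * and waits on it before sampling. */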
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
   radeon_end();
}

/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   if (!sctx->screen->info.never_send_perfcounter_stop) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_uconfig_reg(
      R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(sctx->screen->info.never_stop_sq_perf_counters ?
                                V_036020_CP_PERFMON_STATE_START_COUNTING :
                                V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
         S_036020_PERFMON_SAMPLE_ENABLE(1));
   radeon_end();
}

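/* Start SPM and windowed performance counters and enable perfcounting on
 * compute. */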
void si_pc_emit_spm_start(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);

   /* Start SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));
   /* Start windowed performance counters. */
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));

   radeon_end();
}

void si_pc_emit_spm_stop(struct radeon_cmdbuf *cs, bool never_stop_sq_perf_counters,
                         bool never_send_perfcounter_stop)
{
   radeon_begin(cs);

   /* Stop windowed performance counters. */
   if (!never_send_perfcounter_stop) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(never_stop_sq_perf_counters ?
                                                        V_036020_STRM_PERFMON_STATE_START_COUNTING :
                                                        V_036020_STRM_PERFMON_STATE_STOP_COUNTING));

   radeon_end();
}

void si_pc_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_end();
}

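/* Copy the current value of each counter of a block into the results buffer
 * at "va", one 64-bit slot per counter. Blocks without real counter registers
 * get zeros written instead. */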
static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   radeon_begin(cs);

   if (regs->select0) {
      for (idx = 0; idx < count; ++idx) {
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(reg >> 2);
         radeon_emit(0); /* unused */
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake counters. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL);
         radeon_emit(0); /* immediate */
         radeon_emit(0);
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
      }
   }
   radeon_end();
}

static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   while (query->groups) {
      struct si_query_group *group = query->groups;
      query->groups = group->next;
      FREE(group);
   }

   FREE(query->counters);

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   FREE(query);
}

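/* Force the perfmon clocks on (inhibit clock gating) while counters are in
 * use; this is a no-op on GFX11 and newer. */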
void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (sctx->gfx_level >= GFX11)
      return;

   radeon_begin(&sctx->gfx_cs);

   if (sctx->gfx_level >= GFX10) {
      radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->gfx_level >= GFX8) {
      radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
   radeon_end();
}

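/* Allocate results memory and emit the commands that select and start all
 * counters of the query; called on begin and on resume after a suspend. */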
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
/*
                                   struct si_query_hw *hwquery,
                                   struct si_resource *buffer, uint64_t va)*/
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx, 0);

   if (query->shaders)
      si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;

      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}

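/* Stop counting and read back every group's counters into the results buffer,
 * iterating over all shader engines and instances covered by the group. */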
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, false);
}

static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_query_buffer_reset(ctx, &query->buffer);

   list_addtail(&query->b.active_list, &ctx->active_queries);
   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;

   si_pc_query_resume(ctx, squery);

   return true;
}

static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_pc_query_suspend(ctx, squery);

   list_del(&squery->active_list);
   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;

   return query->buffer.buf != NULL;
}

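/* Accumulate one results snapshot into the user-visible result, summing each
 * counter across all of its instances; only the low 32 bits of each 64-bit
 * slot are used. */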
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}

static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      if (squery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}

static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};

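/* Look up the si_query_group for the given block and sub-group id, creating
 * it (and deriving its shader mask, SE index and instance index) on first
 * use. */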
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct ac_pc_block *block, unsigned sub_gid)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_query_group *group = query->groups;

   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (ac_pc_block_has_per_se_groups(&pc->base, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = ac_pc_shader_type_bits[shader_id];

      query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      /* A non-zero value in query->shaders ensures that the shader
       * masking is reset unless the user explicitly requests one. */
      query->shaders = AC_PC_SHADERS_WINDOWING;
   }

   if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   group->next = query->groups;
   query->groups = group;

   return group;
}

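/* Create a single batch query that samples all requested perfcounter query
 * types at once. Counters are grouped per block/SE/instance so each group can
 * be selected and read back together. */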
struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types)
{
   struct si_screen *screen = (struct si_screen *)ctx->screen;
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   struct si_query_group *group;
   struct si_query_pc *query;
   unsigned base_gid, sub_gid, sub_index;
   unsigned i, j;

   if (!pc)
      return NULL;

   query = CALLOC_STRUCT(si_query_pc);
   if (!query)
      return NULL;

   query->b.ops = &batch_query_ops;

   query->num_counters = num_queries;

   /* Collect selectors per group */
   for (i = 0; i < num_queries; ++i) {
      unsigned sub_gid;

      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
         goto error;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
      if (!block)
         goto error;

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      if (!group)
         goto error;

      if (group->num_counters >= block->b->b->num_counters) {
         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
         goto error;
      }
      group->selectors[group->num_counters] = sub_index;
      ++group->num_counters;
   }

   /* Compute result bases and CS size per group */
   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

   i = 0;
   for (group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned read_dw;
      unsigned instances = 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         instances = screen->info.max_se;
      if (group->instance < 0)
         instances *= block->num_instances;

      group->result_base = i;
      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
      i += instances * group->num_counters;

      read_dw = 6 * group->num_counters;
      query->b.num_cs_dw_suspend += instances * read_dw;
      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
   }

   if (query->shaders) {
      if (query->shaders == AC_PC_SHADERS_WINDOWING)
         query->shaders = 0xffffffff;
   }

   /* Map user-supplied query array to result indices */
   query->counters = CALLOC(num_queries, sizeof(*query->counters));
   for (i = 0; i < num_queries; ++i) {
      struct si_query_counter *counter = &query->counters[i];
      struct ac_pc_block *block;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      assert(group != NULL);

      for (j = 0; j < group->num_counters; ++j) {
         if (group->selectors[j] == sub_index)
            break;
      }

      counter->base = group->result_base + j;
      counter->stride = group->num_counters;

      counter->qwords = 1;
      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         counter->qwords = screen->info.max_se;
      if (group->instance < 0)
         counter->qwords *= block->num_instances;
   }

   return (struct pipe_query *)query;

error:
   si_pc_query_destroy((struct si_context *)ctx, &query->b);
   return NULL;
}

int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
                            struct pipe_driver_query_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   unsigned base_gid, sub;

   if (!pc)
      return 0;

   if (!info) {
      unsigned bid, num_queries = 0;

      for (bid = 0; bid < pc->base.num_blocks; ++bid) {
         num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
      }

      return num_queries;
   }

   block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
   if (!block)
      return 0;

   if (!block->selector_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->selector_names + sub * block->selector_name_stride;
   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
   info->max_value.u64 = 0;
   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
   info->group_id = base_gid + sub / block->b->selectors;
   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
   return 1;
}

int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
                                  struct pipe_driver_query_group_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;

   if (!pc)
      return 0;

   if (!info)
      return pc->base.num_groups;

   block = ac_lookup_group(&pc->base, &index);
   if (!block)
      return 0;

   if (!block->group_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->group_names + index * block->group_name_stride;
   info->num_queries = block->b->selectors;
   info->max_active_queries = block->b->b->num_counters;
   return 1;
}

void si_destroy_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc = screen->perfcounters;

   if (!pc)
      return;

   ac_destroy_perfcounters(&pc->base);
   FREE(pc);
   screen->perfcounters = NULL;
}

void si_init_perfcounters(struct si_screen *screen)
{
   bool separate_se, separate_instance;

   separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!screen->perfcounters)
      return;

   screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   screen->perfcounters->num_instance_cs_dwords = 3;

   if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
                             &screen->perfcounters->base)) {
      si_destroy_perfcounters(screen);
   }
}

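/* Allocate the ring buffer that the RLC streams SPM samples into. */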
static bool
si_spm_init_bo(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size = 32 * 1024 * 1024; /* Default to 32MB. */

   sctx->spm.buffer_size = size;
   sctx->spm.sample_interval = 4096; /* Default to 4096 clk. */

   sctx->spm.bo = ws->buffer_create(
      ws, size, 4096,
      RADEON_DOMAIN_VRAM,
      RADEON_FLAG_NO_INTERPROCESS_SHARING |
      RADEON_FLAG_GTT_WC |
      RADEON_FLAG_NO_SUBALLOC);

   return sctx->spm.bo != NULL;
}

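/* Program the counter select registers for all SPM counters: the per-SE SQG
 * counters first, then every other enabled block instance. */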
static void
si_emit_spm_counters(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm *spm = &sctx->spm;

   radeon_begin(cs);

   for (uint32_t instance = 0; instance < ARRAY_SIZE(spm->sqg); instance++) {
      uint32_t num_counters = spm->sqg[instance].num_counters;

      if (!num_counters)
         continue;

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1) |
                             S_030800_SE_INDEX(instance));

      for (uint32_t b = 0; b < num_counters; b++) {
         const struct ac_spm_counter_select *cntr_sel = &spm->sqg[instance].counters[b];
         uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT;

         radeon_set_uconfig_reg_seq(reg_base + b * 4, 1);
         radeon_emit(cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */
      }
   }

   for (uint32_t b = 0; b < spm->num_block_sel; b++) {
      struct ac_spm_block_select *block_sel = &spm->block_sel[b];
      struct ac_pc_block_base *regs = block_sel->b->b->b;

      for (unsigned i = 0; i < block_sel->num_instances; i++) {
         struct ac_spm_block_instance *block_instance = &block_sel->instances[i];

         radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, block_instance->grbm_gfx_index);

         for (unsigned c = 0; c < block_instance->num_counters; c++) {
            const struct ac_spm_counter_select *cntr_sel = &block_instance->counters[c];

            if (!cntr_sel->active)
               continue;

            radeon_set_uconfig_reg_seq(regs->select0[c], 1);
            radeon_emit(cntr_sel->sel0);

            radeon_set_uconfig_reg_seq(regs->select1[c], 1);
            radeon_emit(cntr_sel->sel1);
         }
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   radeon_end();
}

#define SPM_RING_BASE_ALIGN 32

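/* Program the RLC for streaming performance monitoring: ring buffer address
 * and size, sample interval, per-segment muxsel line counts, and the muxsel
 * RAM contents for each shader engine and the global segment. */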
void
si_emit_spm_setup(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm *spm = &sctx->spm;
   uint64_t va = sctx->screen->ws->buffer_get_virtual_address(spm->bo);
   uint64_t ring_size = spm->buffer_size;

   /* It's required that the ring VA and the size are correctly aligned. */
   assert(!(va & (SPM_RING_BASE_ALIGN - 1)));
   assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1)));
   assert(spm->sample_interval >= 32);

   radeon_begin(cs);

   /* Configure the SPM ring buffer. */
   radeon_set_uconfig_reg(R_037200_RLC_SPM_PERFMON_CNTL,
                          S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */
                          S_037200_PERFMON_SAMPLE_INTERVAL(spm->sample_interval)); /* in sclk */
   radeon_set_uconfig_reg(R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va);
   radeon_set_uconfig_reg(R_037208_RLC_SPM_PERFMON_RING_BASE_HI,
                          S_037208_RING_BASE_HI(va >> 32));
   radeon_set_uconfig_reg(R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size);

   /* Configure the muxsel. */
   uint32_t total_muxsel_lines = 0;
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      total_muxsel_lines += spm->num_muxsel_lines[s];
   }

   radeon_set_uconfig_reg(R_03726C_RLC_SPM_ACCUM_MODE, 0);
   radeon_set_uconfig_reg(R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0);
   radeon_set_uconfig_reg(R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE,
                          S_03727C_SE0_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE0]) |
                          S_03727C_SE1_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE1]) |
                          S_03727C_SE2_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE2]) |
                          S_03727C_SE3_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE3]));
   radeon_set_uconfig_reg(R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE,
                          S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) |
                          S_037280_GLOBAL_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_GLOBAL]));

   /* Upload each muxsel ram to the RLC. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned rlc_muxsel_addr, rlc_muxsel_data;
      unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) |
                                S_030800_INSTANCE_BROADCAST_WRITES(1);

      if (!spm->num_muxsel_lines[s])
         continue;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);

         rlc_muxsel_addr = R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR;
         rlc_muxsel_data = R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA;
      } else {
         grbm_gfx_index |= S_030800_SE_INDEX(s);

         rlc_muxsel_addr = R_03721C_RLC_SPM_SE_MUXSEL_ADDR;
         rlc_muxsel_data = R_037220_RLC_SPM_SE_MUXSEL_DATA;
      }

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, grbm_gfx_index);

      for (unsigned l = 0; l < spm->num_muxsel_lines[s]; l++) {
         uint32_t *data = (uint32_t *)spm->muxsel_lines[s][l].muxsel;

         /* Select MUXSEL_ADDR to point to the next muxsel. */
         radeon_set_uconfig_reg(rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE);

         /* Write the muxsel line configuration with MUXSEL_DATA. */
         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0));
         radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) |
                     S_370_WR_CONFIRM(1) |
                     S_370_ENGINE_SEL(V_370_ME) |
                     S_370_WR_ONE_ADDR(1));
         radeon_emit(rlc_muxsel_data >> 2);
         radeon_emit(0);
         radeon_emit_array(data, AC_SPM_MUXSEL_LINE_SIZE);
      }
   }
   radeon_end();

   /* Select SPM counters. */
   si_emit_spm_counters(sctx, cs);
}

bool
si_spm_init(struct si_context *sctx)
{
   const struct radeon_info *info = &sctx->screen->info;

   sctx->screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!sctx->screen->perfcounters)
      return false;

   sctx->screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(sctx->screen);
   sctx->screen->perfcounters->num_instance_cs_dwords = 3;

   struct ac_perfcounters *pc = &sctx->screen->perfcounters->base;

   if (!ac_init_perfcounters(info, false, false, pc))
      return false;

   if (!ac_init_spm(info, pc, &sctx->spm))
      return false;

   if (!si_spm_init_bo(sctx))
      return false;

   return true;
}

void
si_spm_finish(struct si_context *sctx)
{
   struct pb_buffer_lean *bo = sctx->spm.bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   ac_destroy_spm(&sctx->spm);
}