/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"
#include "si_query.h"
#include "util/u_memory.h"

#include "ac_perfcounter.h"

struct si_query_group {
   struct si_query_group *next;
   struct ac_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;
   int instance;
   unsigned num_counters;
   unsigned selectors[AC_QUERY_MAX_COUNTERS];
};

struct si_query_counter {
   unsigned base;
   unsigned qwords;
   unsigned stride; /* in uint64s */
};

struct si_query_pc {
   struct si_query b;
   struct si_query_buffer buffer;

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;
   unsigned num_counters;
   struct si_query_counter *counters;
   struct si_query_group *groups;
};

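/* Select which shader engine (SE) and block instance subsequent perf counter
 * register writes target by programming GRBM_GFX_INDEX. A negative se or
 * instance broadcasts the writes to all SEs or instances respectively. */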
static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (sctx->gfx_level >= GFX10) {
      /* TODO: Expose counters from each shader array separately if needed. */
      value |= S_030800_SA_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
   radeon_end();
}

void si_pc_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(shaders & 0x7f);
   radeon_emit(0xffffffff);
   radeon_end();
}

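/* Program the counter select registers of one block with the chosen event
 * selectors. Blocks without select registers ("fake" counters) are handled
 * entirely at read time, so nothing is emitted for them here. */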
static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   radeon_begin(cs);

   for (idx = 0; idx < count; ++idx) {
      radeon_set_uconfig_reg_seq(regs->select0[idx], 1);
      radeon_emit(selectors[idx] | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(regs->select1[idx], 1);
      radeon_emit(0);
   }

   radeon_end();
}

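/* Reset and start the counters. The first dword of this result slice is set
 * to 1 here; si_pc_emit_stop later releases a 0 there at bottom-of-pipe and
 * waits for it, so the counters are only sampled once all prior work has
 * drained. The slot is subsequently overwritten by the counter read-back. */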
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
   radeon_end();
}

/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
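/* Sample the counters into their readable registers and stop counting. Chips
 * with the never_send_perfcounter_stop / never_stop_sq_perf_counters
 * workarounds skip the STOP event or keep counting instead of stopping. */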
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   if (!sctx->screen->info.never_send_perfcounter_stop) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_uconfig_reg(
      R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(sctx->screen->info.never_stop_sq_perf_counters ?
                                V_036020_CP_PERFMON_STATE_START_COUNTING :
                                V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
      S_036020_PERFMON_SAMPLE_ENABLE(1));
   radeon_end();
}

void si_pc_emit_spm_start(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);

   /* Start SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));
   /* Start windowed performance counters. */
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));

   radeon_end();
}

void si_pc_emit_spm_stop(struct radeon_cmdbuf *cs, bool never_stop_sq_perf_counters,
                         bool never_send_perfcounter_stop)
{
   radeon_begin(cs);

   /* Stop windowed performance counters. */
   if (!never_send_perfcounter_stop) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(never_stop_sq_perf_counters ?
                             V_036020_STRM_PERFMON_STATE_START_COUNTING :
                             V_036020_STRM_PERFMON_STATE_STOP_COUNTING));

   radeon_end();
}

void si_pc_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_end();
}

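/* Copy the current value of every selected counter of one block into the
 * results buffer at va, one 64-bit slot per counter. Fake counters have no
 * readable registers, so an immediate 0 is written for them instead. */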
static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   radeon_begin(cs);

   if (regs->select0) {
      for (idx = 0; idx < count; ++idx) {
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(reg >> 2);
         radeon_emit(0); /* unused */
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake counters. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL);
         radeon_emit(0); /* immediate */
         radeon_emit(0);
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
      }
   }
   radeon_end();
}

static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   while (query->groups) {
      struct si_query_group *group = query->groups;
      query->groups = group->next;
      FREE(group);
   }

   FREE(query->counters);

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   FREE(query);
}

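/* Keep the perfmon clocks ungated while counters are active; with clock
 * gating enabled the counters may not count reliably. Not needed on GFX11. */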
void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (sctx->gfx_level >= GFX11)
      return;

   radeon_begin(&sctx->gfx_cs);

   if (sctx->gfx_level >= GFX10) {
      radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->gfx_level >= GFX8) {
      radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
   radeon_end();
}

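/* (Re)start a perf counter query: allocate space for one result snapshot,
 * program the shader mask and the selectors of every counter group, then
 * reset and start the counters. Used by begin_query and when the query is
 * resumed after a command buffer flush. */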
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
/*
                                   struct si_query_hw *hwquery,
                                   struct si_resource *buffer, uint64_t va)*/
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx, 0);

   if (query->shaders)
      si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;

      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}

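/* Stop the counters and read the current snapshot back into the results
 * buffer. Groups that broadcast across shader engines and/or instances are
 * read once per SE/instance, so they occupy several consecutive qwords. */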
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, false);
}

static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_query_buffer_reset(ctx, &query->buffer);

   list_addtail(&query->b.active_list, &ctx->active_queries);
   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;

   si_pc_query_resume(ctx, squery);

   return true;
}

static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_pc_query_suspend(ctx, squery);

   list_del(&squery->active_list);
   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;

   return query->buffer.buf != NULL;
}

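/* Accumulate one snapshot into the final batch result. The hardware counters
 * are 32 bits wide, so only the low dword of each 64-bit slot is added; slots
 * of the same counter from different SEs/instances are summed via the
 * per-counter stride. */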
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}

static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      if (squery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}

static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};

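/* Find the si_query_group for (block, sub_gid) in this query, creating it on
 * first use. For shader blocks this also decodes the requested shader stage
 * mask and checks that all counters in the query agree on it. */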
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct ac_pc_block *block, unsigned sub_gid)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_query_group *group = query->groups;

   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (ac_pc_block_has_per_se_groups(&pc->base, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = ac_pc_shader_type_bits[shader_id];

      query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      // A non-zero value in query->shaders ensures that the shader
      // masking is reset unless the user explicitly requests one.
      query->shaders = AC_PC_SHADERS_WINDOWING;
   }

   if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   group->next = query->groups;
   query->groups = group;

   return group;
}

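/* Backs pipe_context::create_batch_query: combine several perf counter query
 * types (SI_QUERY_FIRST_PERFCOUNTER + index) into one query whose results are
 * returned per counter in result->batch[i], in the order they were requested.
 *
 * Rough usage sketch from a hypothetical Gallium frontend (not code from this
 * file; the caller must provide result storage large enough to hold
 * batch[num_queries] entries):
 *
 *    unsigned types[2] = { SI_QUERY_FIRST_PERFCOUNTER + idx0,
 *                          SI_QUERY_FIRST_PERFCOUNTER + idx1 };
 *    struct pipe_query *q = pipe->create_batch_query(pipe, 2, types);
 *    pipe->begin_query(pipe, q);
 *    ... submit work ...
 *    pipe->end_query(pipe, q);
 *    pipe->get_query_result(pipe, q, true, results); // results->batch[0..1]
 */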
struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types)
{
   struct si_screen *screen = (struct si_screen *)ctx->screen;
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   struct si_query_group *group;
   struct si_query_pc *query;
   unsigned base_gid, sub_gid, sub_index;
   unsigned i, j;

   if (!pc)
      return NULL;

   query = CALLOC_STRUCT(si_query_pc);
   if (!query)
      return NULL;

   query->b.ops = &batch_query_ops;

   query->num_counters = num_queries;

   /* Collect selectors per group */
   for (i = 0; i < num_queries; ++i) {
      unsigned sub_gid;

      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
         goto error;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
      if (!block)
         goto error;

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      if (!group)
         goto error;

      if (group->num_counters >= block->b->b->num_counters) {
         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
         goto error;
      }
      group->selectors[group->num_counters] = sub_index;
      ++group->num_counters;
   }

   /* Compute result bases and CS size per group */
   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

   i = 0;
   for (group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned read_dw;
      unsigned instances = 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         instances = screen->info.max_se;
      if (group->instance < 0)
         instances *= block->num_instances;

      group->result_base = i;
      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
      i += instances * group->num_counters;

      read_dw = 6 * group->num_counters;
      query->b.num_cs_dw_suspend += instances * read_dw;
      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
   }

   if (query->shaders) {
      if (query->shaders == AC_PC_SHADERS_WINDOWING)
         query->shaders = 0xffffffff;
   }

   /* Map user-supplied query array to result indices */
   query->counters = CALLOC(num_queries, sizeof(*query->counters));
   for (i = 0; i < num_queries; ++i) {
      struct si_query_counter *counter = &query->counters[i];
      struct ac_pc_block *block;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      assert(group != NULL);

      for (j = 0; j < group->num_counters; ++j) {
         if (group->selectors[j] == sub_index)
            break;
      }

      counter->base = group->result_base + j;
      counter->stride = group->num_counters;

      counter->qwords = 1;
      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         counter->qwords = screen->info.max_se;
      if (group->instance < 0)
         counter->qwords *= block->num_instances;
   }

   return (struct pipe_query *)query;

error:
   si_pc_query_destroy((struct si_context *)ctx, &query->b);
   return NULL;
}

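/* Backs pipe_screen::get_driver_query_info for perf counters. Called with
 * info == NULL it only returns how many counter queries exist; otherwise it
 * fills in the name, type and group of the counter at the given index. */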
int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
                            struct pipe_driver_query_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   unsigned base_gid, sub;

   if (!pc)
      return 0;

   if (!info) {
      unsigned bid, num_queries = 0;

      for (bid = 0; bid < pc->base.num_blocks; ++bid) {
         num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
      }

      return num_queries;
   }

   block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
   if (!block)
      return 0;

   if (!block->selector_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->selector_names + sub * block->selector_name_stride;
   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
   info->max_value.u64 = 0;
   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
   info->group_id = base_gid + sub / block->b->selectors;
   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
   return 1;
}

int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
                                  struct pipe_driver_query_group_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;

   if (!pc)
      return 0;

   if (!info)
      return pc->base.num_groups;

   block = ac_lookup_group(&pc->base, &index);
   if (!block)
      return 0;

   if (!block->group_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->group_names + index * block->group_name_stride;
   info->num_queries = block->b->selectors;
   info->max_active_queries = block->b->b->num_counters;
   return 1;
}

void si_destroy_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc = screen->perfcounters;

   if (!pc)
      return;

   ac_destroy_perfcounters(&pc->base);
   FREE(pc);
   screen->perfcounters = NULL;
}

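/* Create the screen-wide perf counter tables. The RADEON_PC_SEPARATE_SE and
 * RADEON_PC_SEPARATE_INSTANCE environment variables expose separate query
 * groups per shader engine / per block instance instead of aggregated ones. */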
void si_init_perfcounters(struct si_screen *screen)
{
   bool separate_se, separate_instance;

   separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!screen->perfcounters)
      return;

   screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   screen->perfcounters->num_instance_cs_dwords = 3;

   if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
                             &screen->perfcounters->base)) {
      si_destroy_perfcounters(screen);
   }
}

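/* Allocate the ring buffer that the RLC streams SPM samples into:
 * 32 MiB in VRAM by default, with a 4096-clock sample interval. */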
static bool
si_spm_init_bo(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size = 32 * 1024 * 1024; /* Default to 32MB. */

   sctx->spm.buffer_size = size;
   sctx->spm.sample_interval = 4096; /* Default to 4096 clk. */

   sctx->spm.bo = ws->buffer_create(
      ws, size, 4096,
      RADEON_DOMAIN_VRAM,
      RADEON_FLAG_NO_INTERPROCESS_SHARING |
         RADEON_FLAG_GTT_WC |
         RADEON_FLAG_NO_SUBALLOC);

   return sctx->spm.bo != NULL;
}

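/* Program the counter selects for all SPM counters: first the per-SE SQG
 * counters, then every other enabled block instance, each addressed through
 * GRBM_GFX_INDEX. Global broadcasting is restored at the end. */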
static void
si_emit_spm_counters(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm *spm = &sctx->spm;

   radeon_begin(cs);

   for (uint32_t instance = 0; instance < ARRAY_SIZE(spm->sqg); instance++) {
      uint32_t num_counters = spm->sqg[instance].num_counters;

      if (!num_counters)
         continue;

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1) |
                             S_030800_SE_INDEX(instance));

      for (uint32_t b = 0; b < num_counters; b++) {
         const struct ac_spm_counter_select *cntr_sel = &spm->sqg[instance].counters[b];
         uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT;

         radeon_set_uconfig_reg_seq(reg_base + b * 4, 1);
         radeon_emit(cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */
      }
   }

   for (uint32_t b = 0; b < spm->num_block_sel; b++) {
      struct ac_spm_block_select *block_sel = &spm->block_sel[b];
      struct ac_pc_block_base *regs = block_sel->b->b->b;

      for (unsigned i = 0; i < block_sel->num_instances; i++) {
         struct ac_spm_block_instance *block_instance = &block_sel->instances[i];

         radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, block_instance->grbm_gfx_index);

         for (unsigned c = 0; c < block_instance->num_counters; c++) {
            const struct ac_spm_counter_select *cntr_sel = &block_instance->counters[c];

            if (!cntr_sel->active)
               continue;

            radeon_set_uconfig_reg_seq(regs->select0[c], 1);
            radeon_emit(cntr_sel->sel0);

            radeon_set_uconfig_reg_seq(regs->select1[c], 1);
            radeon_emit(cntr_sel->sel1);
         }
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   radeon_end();
}

#define SPM_RING_BASE_ALIGN 32

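/* Program the SPM ring buffer (base address, size, sample interval), the
 * per-segment muxsel line counts, and upload the muxsel RAM that routes
 * counter outputs into ring buffer slots, then emit the SPM counter selects. */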
void
si_emit_spm_setup(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm *spm = &sctx->spm;
   uint64_t va = sctx->screen->ws->buffer_get_virtual_address(spm->bo);
   uint64_t ring_size = spm->buffer_size;

   /* It's required that the ring VA and the size are correctly aligned. */
   assert(!(va & (SPM_RING_BASE_ALIGN - 1)));
   assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1)));
   assert(spm->sample_interval >= 32);

   radeon_begin(cs);

   /* Configure the SPM ring buffer. */
   radeon_set_uconfig_reg(R_037200_RLC_SPM_PERFMON_CNTL,
                          S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */
                          S_037200_PERFMON_SAMPLE_INTERVAL(spm->sample_interval)); /* in sclk */
   radeon_set_uconfig_reg(R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va);
   radeon_set_uconfig_reg(R_037208_RLC_SPM_PERFMON_RING_BASE_HI,
                          S_037208_RING_BASE_HI(va >> 32));
   radeon_set_uconfig_reg(R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size);

   /* Configure the muxsel. */
   uint32_t total_muxsel_lines = 0;
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      total_muxsel_lines += spm->num_muxsel_lines[s];
   }

   radeon_set_uconfig_reg(R_03726C_RLC_SPM_ACCUM_MODE, 0);
   radeon_set_uconfig_reg(R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0);
   radeon_set_uconfig_reg(R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE,
                          S_03727C_SE0_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE0]) |
                          S_03727C_SE1_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE1]) |
                          S_03727C_SE2_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE2]) |
                          S_03727C_SE3_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE3]));
   radeon_set_uconfig_reg(R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE,
                          S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) |
                          S_037280_GLOBAL_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_GLOBAL]));

   /* Upload each muxsel ram to the RLC. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned rlc_muxsel_addr, rlc_muxsel_data;
      unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) |
                                S_030800_INSTANCE_BROADCAST_WRITES(1);

      if (!spm->num_muxsel_lines[s])
         continue;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);

         rlc_muxsel_addr = R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR;
         rlc_muxsel_data = R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA;
      } else {
         grbm_gfx_index |= S_030800_SE_INDEX(s);

         rlc_muxsel_addr = R_03721C_RLC_SPM_SE_MUXSEL_ADDR;
         rlc_muxsel_data = R_037220_RLC_SPM_SE_MUXSEL_DATA;
      }

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, grbm_gfx_index);

      for (unsigned l = 0; l < spm->num_muxsel_lines[s]; l++) {
         uint32_t *data = (uint32_t *)spm->muxsel_lines[s][l].muxsel;

         /* Select MUXSEL_ADDR to point to the next muxsel. */
         radeon_set_uconfig_reg(rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE);

         /* Write the muxsel line configuration with MUXSEL_DATA. */
         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0));
         radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) |
                     S_370_WR_CONFIRM(1) |
                     S_370_ENGINE_SEL(V_370_ME) |
                     S_370_WR_ONE_ADDR(1));
         radeon_emit(rlc_muxsel_data >> 2);
         radeon_emit(0);
         radeon_emit_array(data, AC_SPM_MUXSEL_LINE_SIZE);
      }
   }
   radeon_end();

   /* Select SPM counters. */
   si_emit_spm_counters(sctx, cs);
}

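/* Set up SPM for a context: per-screen counter descriptions, the SPM
 * counter/muxsel configuration, and the output ring buffer. */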
bool
si_spm_init(struct si_context *sctx)
{
   const struct radeon_info *info = &sctx->screen->info;

   sctx->screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   sctx->screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(sctx->screen);
   sctx->screen->perfcounters->num_instance_cs_dwords = 3;

   struct ac_perfcounters *pc = &sctx->screen->perfcounters->base;

   if (!ac_init_perfcounters(info, false, false, pc))
      return false;

   if (!ac_init_spm(info, pc, &sctx->spm))
      return false;

   if (!si_spm_init_bo(sctx))
      return false;

   return true;
}

void
si_spm_finish(struct si_context *sctx)
{
   struct pb_buffer_lean *bo = sctx->spm.bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   ac_destroy_spm(&sctx->spm);
}