/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "si_build_pm4.h"
#include "si_query.h"
#include "util/u_memory.h"

#include "ac_perfcounter.h"

struct si_query_group {
   struct si_query_group *next;
   struct ac_pc_block *block;
   unsigned sub_gid;     /* only used during init */
   unsigned result_base; /* only used during init */
   int se;
   int instance;
   unsigned num_counters;
   unsigned selectors[AC_QUERY_MAX_COUNTERS];
};

struct si_query_counter {
   unsigned base;
   unsigned qwords;
   unsigned stride; /* in uint64s */
};

struct si_query_pc {
   struct si_query b;
   struct si_query_buffer buffer;

   /* Size of the results in memory, in bytes. */
   unsigned result_size;

   unsigned shaders;
   unsigned num_counters;
   struct si_query_counter *counters;
   struct si_query_group *groups;
};

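/* Program GRBM_GFX_INDEX so that subsequent perf counter register writes go
 * to a specific shader engine and block instance; a negative se/instance
 * selects broadcast to all of them. */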
static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned value = S_030800_SH_BROADCAST_WRITES(1);

   if (se >= 0) {
      value |= S_030800_SE_INDEX(se);
   } else {
      value |= S_030800_SE_BROADCAST_WRITES(1);
   }

   if (sctx->gfx_level >= GFX10) {
      /* TODO: Expose counters from each shader array separately if needed. */
      value |= S_030800_SA_BROADCAST_WRITES(1);
   }

   if (instance >= 0) {
      value |= S_030800_INSTANCE_INDEX(instance);
   } else {
      value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
   }

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value);
   radeon_end();
}

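/* Limit the SQ performance counters to the given mask of shader stages via
 * SQ_PERFCOUNTER_CTRL; the following register in the sequence is set to
 * all-ones. */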
void si_pc_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false);
   radeon_emit(shaders & 0x7f);
   radeon_emit(0xffffffff);
   radeon_end();
}

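/* Program the per-counter event select registers for one block. Blocks
 * without select registers ("fake" counters) are skipped, and the SPM select
 * registers are cleared. */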
static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                              unsigned *selectors)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;

   assert(count <= regs->num_counters);

   /* Fake counters. */
   if (!regs->select0)
      return;

   radeon_begin(cs);

   for (idx = 0; idx < count; ++idx) {
      radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false);
      radeon_emit(selectors[idx] | regs->select_or);
   }

   for (idx = 0; idx < regs->num_spm_counters; idx++) {
      radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false);
      radeon_emit(0);
   }

   radeon_end();
}

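/* Reset and start the counters. The COPY_DATA writes a non-zero marker to the
 * results buffer; si_pc_emit_stop later overwrites it with 0 from a
 * bottom-of-pipe fence and waits on it before sampling. */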
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
                   COPY_DATA_IMM, NULL, 1);

   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
   radeon_end();
}

/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
 * do it again in here. */
static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                     EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
   si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);

   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   if (!sctx->screen->info.never_send_perfcounter_stop) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_uconfig_reg(
      R_036020_CP_PERFMON_CNTL,
      S_036020_PERFMON_STATE(sctx->screen->info.never_stop_sq_perf_counters ?
                                V_036020_CP_PERFMON_STATE_START_COUNTING :
                                V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
      S_036020_PERFMON_SAMPLE_ENABLE(1));
   radeon_end();
}

void si_pc_emit_spm_start(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);

   /* Start SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                             S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));
   /* Start windowed performance counters. */
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));

   radeon_end();
}

void si_pc_emit_spm_stop(struct radeon_cmdbuf *cs, bool never_stop_sq_perf_counters,
                         bool never_send_perfcounter_stop)
{
   radeon_begin(cs);

   /* Stop windowed performance counters. */
   if (!never_send_perfcounter_stop) {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
   }

   radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(never_stop_sq_perf_counters ?
                             V_036020_STRM_PERFMON_STATE_START_COUNTING :
                             V_036020_STRM_PERFMON_STATE_STOP_COUNTING));

   radeon_end();
}

void si_pc_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                          S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
   radeon_end();
}


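/* Copy the current value of each counter in a block into the results buffer,
 * one 64-bit slot per counter. Blocks without real counter registers just
 * write zeroes. */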
static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, unsigned count,
                            uint64_t va)
{
   struct ac_pc_block_base *regs = block->b->b;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned idx;
   unsigned reg = regs->counter0_lo;
   unsigned reg_delta = 8;

   radeon_begin(cs);

   if (regs->select0) {
      for (idx = 0; idx < count; ++idx) {
         if (regs->counters)
            reg = regs->counters[idx];

         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                            COPY_DATA_COUNT_SEL); /* 64 bits */
         radeon_emit(reg >> 2);
         radeon_emit(0); /* unused */
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
         reg += reg_delta;
      }
   } else {
      /* Fake counters. */
      for (idx = 0; idx < count; ++idx) {
         radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
         radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                     COPY_DATA_COUNT_SEL);
         radeon_emit(0); /* immediate */
         radeon_emit(0);
         radeon_emit(va);
         radeon_emit(va >> 32);
         va += sizeof(uint64_t);
      }
   }
   radeon_end();
}

static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   while (query->groups) {
      struct si_query_group *group = query->groups;
      query->groups = group->next;
      FREE(group);
   }

   FREE(query->counters);

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   FREE(query);
}

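/* Inhibit (or re-enable) RLC clock gating of the perfmon clocks while
 * counters are active. No-op on GFX11 and on chips older than GFX8. */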
void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (sctx->gfx_level >= GFX11)
      return;

   radeon_begin(&sctx->gfx_cs);

   if (sctx->gfx_level >= GFX10) {
      radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (sctx->gfx_level >= GFX8) {
      radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
   radeon_end();
}

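/* Start (or restart) a perf counter query: allocate result space, program the
 * shader mask and the counter selects for every group, then begin counting. */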
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
/*
                                   struct si_query_hw *hwquery,
                                   struct si_resource *buffer, uint64_t va)*/
{
   struct si_query_pc *query = (struct si_query_pc *)squery;
   int current_se = -1;
   int current_instance = -1;

   if (!si_query_buffer_alloc(sctx, &query->buffer, NULL, query->result_size))
      return;
   si_need_gfx_cs_space(sctx, 0);

   if (query->shaders)
      si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, true);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;

      if (group->se != current_se || group->instance != current_instance) {
         current_se = group->se;
         current_instance = group->instance;
         si_pc_emit_instance(sctx, group->se, group->instance);
      }

      si_pc_emit_select(sctx, block, group->num_counters, group->selectors);
   }

   if (current_se != -1 || current_instance != -1)
      si_pc_emit_instance(sctx, -1, -1);

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   si_pc_emit_start(sctx, query->buffer.buf, va);
}

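/* Stop counting and read the counters back into the results buffer, iterating
 * over every shader engine and instance covered by each group. */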
static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   if (!query->buffer.buf)
      return;

   uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->buffer.results_end += query->result_size;

   si_pc_emit_stop(sctx, query->buffer.buf, va);

   for (struct si_query_group *group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned se = group->se >= 0 ? group->se : 0;
      unsigned se_end = se + 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && (group->se < 0))
         se_end = sctx->screen->info.max_se;

      do {
         unsigned instance = group->instance >= 0 ? group->instance : 0;

         do {
            si_pc_emit_instance(sctx, se, instance);
            si_pc_emit_read(sctx, block, group->num_counters, va);
            va += sizeof(uint64_t) * group->num_counters;
         } while (group->instance < 0 && ++instance < block->num_instances);
      } while (++se < se_end);
   }

   si_pc_emit_instance(sctx, -1, -1);

   si_inhibit_clockgating(sctx, &sctx->gfx_cs, false);
}

static bool si_pc_query_begin(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_query_buffer_reset(ctx, &query->buffer);

   list_addtail(&query->b.active_list, &ctx->active_queries);
   ctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;

   si_pc_query_resume(ctx, squery);

   return true;
}

static bool si_pc_query_end(struct si_context *ctx, struct si_query *squery)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   si_pc_query_suspend(ctx, squery);

   list_del(&squery->active_list);
   ctx->num_cs_dw_queries_suspend -= squery->num_cs_dw_suspend;

   return query->buffer.buf != NULL;
}

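/* Accumulate one result snapshot into the user-visible batch result. Only the
 * low 32 bits of each 64-bit slot are summed. */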
static void si_pc_query_add_result(struct si_query_pc *query, void *buffer,
                                   union pipe_query_result *result)
{
   uint64_t *results = buffer;
   unsigned i, j;

   for (i = 0; i < query->num_counters; ++i) {
      struct si_query_counter *counter = &query->counters[i];

      for (j = 0; j < counter->qwords; ++j) {
         uint32_t value = results[counter->base + j * counter->stride];
         result->batch[i].u64 += value;
      }
   }
}

static bool si_pc_query_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_pc *query = (struct si_query_pc *)squery;

   memset(result, 0, sizeof(result->batch[0]) * query->num_counters);

   for (struct si_query_buffer *qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      if (squery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      while (results_base != qbuf->results_end) {
         si_pc_query_add_result(query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   return true;
}

static const struct si_query_ops batch_query_ops = {
   .destroy = si_pc_query_destroy,
   .begin = si_pc_query_begin,
   .end = si_pc_query_end,
   .get_result = si_pc_query_get_result,

   .suspend = si_pc_query_suspend,
   .resume = si_pc_query_resume,
};

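/* Return the query group for (block, sub_gid), creating it if needed and
 * decoding the per-SE / per-instance / shader-stage parts of sub_gid. */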
static struct si_query_group *get_group_state(struct si_screen *screen, struct si_query_pc *query,
                                              struct ac_pc_block *block, unsigned sub_gid)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct si_query_group *group = query->groups;

   while (group) {
      if (group->block == block && group->sub_gid == sub_gid)
         return group;
      group = group->next;
   }

   group = CALLOC_STRUCT(si_query_group);
   if (!group)
      return NULL;

   group->block = block;
   group->sub_gid = sub_gid;

   if (block->b->b->flags & AC_PC_BLOCK_SHADER) {
      unsigned sub_gids = block->num_instances;
      unsigned shader_id;
      unsigned shaders;
      unsigned query_shaders;

      if (ac_pc_block_has_per_se_groups(&pc->base, block))
         sub_gids = sub_gids * screen->info.max_se;
      shader_id = sub_gid / sub_gids;
      sub_gid = sub_gid % sub_gids;

      shaders = ac_pc_shader_type_bits[shader_id];

      query_shaders = query->shaders & ~AC_PC_SHADERS_WINDOWING;
      if (query_shaders && query_shaders != shaders) {
         fprintf(stderr, "si_perfcounter: incompatible shader groups\n");
         FREE(group);
         return NULL;
      }
      query->shaders = shaders;
   }

   if (block->b->b->flags & AC_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
      // A non-zero value in query->shaders ensures that the shader
      // masking is reset unless the user explicitly requests one.
      query->shaders = AC_PC_SHADERS_WINDOWING;
   }

   if (ac_pc_block_has_per_se_groups(&pc->base, block)) {
      group->se = sub_gid / block->num_instances;
      sub_gid = sub_gid % block->num_instances;
   } else {
      group->se = -1;
   }

   if (ac_pc_block_has_per_instance_groups(&pc->base, block)) {
      group->instance = sub_gid;
   } else {
      group->instance = -1;
   }

   group->next = query->groups;
   query->groups = group;

   return group;
}

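/* Create a batch query from a list of perf counter query types: group the
 * requested counters per block/SE/instance, size the result buffer and the
 * command stream space, and record where each user counter lands in the
 * results. */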
struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types)
{
   struct si_screen *screen = (struct si_screen *)ctx->screen;
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   struct si_query_group *group;
   struct si_query_pc *query;
   unsigned base_gid, sub_gid, sub_index;
   unsigned i, j;

   if (!pc)
      return NULL;

   query = CALLOC_STRUCT(si_query_pc);
   if (!query)
      return NULL;

   query->b.ops = &batch_query_ops;

   query->num_counters = num_queries;

   /* Collect selectors per group */
   for (i = 0; i < num_queries; ++i) {
      unsigned sub_gid;

      if (query_types[i] < SI_QUERY_FIRST_PERFCOUNTER)
         goto error;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);
      if (!block)
         goto error;

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      if (!group)
         goto error;

      if (group->num_counters >= block->b->b->num_counters) {
         fprintf(stderr, "perfcounter group %s: too many selected\n", block->b->b->name);
         goto error;
      }
      group->selectors[group->num_counters] = sub_index;
      ++group->num_counters;
   }

   /* Compute result bases and CS size per group */
   query->b.num_cs_dw_suspend = pc->num_stop_cs_dwords;
   query->b.num_cs_dw_suspend += pc->num_instance_cs_dwords;

   i = 0;
   for (group = query->groups; group; group = group->next) {
      struct ac_pc_block *block = group->block;
      unsigned read_dw;
      unsigned instances = 1;

      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         instances = screen->info.max_se;
      if (group->instance < 0)
         instances *= block->num_instances;

      group->result_base = i;
      query->result_size += sizeof(uint64_t) * instances * group->num_counters;
      i += instances * group->num_counters;

      read_dw = 6 * group->num_counters;
      query->b.num_cs_dw_suspend += instances * read_dw;
      query->b.num_cs_dw_suspend += instances * pc->num_instance_cs_dwords;
   }

   if (query->shaders) {
      if (query->shaders == AC_PC_SHADERS_WINDOWING)
         query->shaders = 0xffffffff;
   }

   /* Map user-supplied query array to result indices */
   query->counters = CALLOC(num_queries, sizeof(*query->counters));
   for (i = 0; i < num_queries; ++i) {
      struct si_query_counter *counter = &query->counters[i];
      struct ac_pc_block *block;

      block =
         ac_lookup_counter(&pc->base, query_types[i] - SI_QUERY_FIRST_PERFCOUNTER, &base_gid, &sub_index);

      sub_gid = sub_index / block->b->selectors;
      sub_index = sub_index % block->b->selectors;

      group = get_group_state(screen, query, block, sub_gid);
      assert(group != NULL);

      for (j = 0; j < group->num_counters; ++j) {
         if (group->selectors[j] == sub_index)
            break;
      }

      counter->base = group->result_base + j;
      counter->stride = group->num_counters;

      counter->qwords = 1;
      if ((block->b->b->flags & AC_PC_BLOCK_SE) && group->se < 0)
         counter->qwords = screen->info.max_se;
      if (group->instance < 0)
         counter->qwords *= block->num_instances;
   }

   return (struct pipe_query *)query;

error:
   si_pc_query_destroy((struct si_context *)ctx, &query->b);
   return NULL;
}

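/* Describe one perf counter for the driver-query interface; with info == NULL
 * this returns the total number of exposed counters instead. */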
int si_get_perfcounter_info(struct si_screen *screen, unsigned index,
                            struct pipe_driver_query_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;
   unsigned base_gid, sub;

   if (!pc)
      return 0;

   if (!info) {
      unsigned bid, num_queries = 0;

      for (bid = 0; bid < pc->base.num_blocks; ++bid) {
         num_queries += pc->base.blocks[bid].b->selectors * pc->base.blocks[bid].num_groups;
      }

      return num_queries;
   }

   block = ac_lookup_counter(&pc->base, index, &base_gid, &sub);
   if (!block)
      return 0;

   if (!block->selector_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->selector_names + sub * block->selector_name_stride;
   info->query_type = SI_QUERY_FIRST_PERFCOUNTER + index;
   info->max_value.u64 = 0;
   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
   info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
   info->group_id = base_gid + sub / block->b->selectors;
   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
   if (sub > 0 && sub + 1 < block->b->selectors * block->num_groups)
      info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
   return 1;
}

int si_get_perfcounter_group_info(struct si_screen *screen, unsigned index,
                                  struct pipe_driver_query_group_info *info)
{
   struct si_perfcounters *pc = screen->perfcounters;
   struct ac_pc_block *block;

   if (!pc)
      return 0;

   if (!info)
      return pc->base.num_groups;

   block = ac_lookup_group(&pc->base, &index);
   if (!block)
      return 0;

   if (!block->group_names) {
      if (!ac_init_block_names(&screen->info, &pc->base, block))
         return 0;
   }
   info->name = block->group_names + index * block->group_name_stride;
   info->num_queries = block->b->selectors;
   info->max_active_queries = block->b->b->num_counters;
   return 1;
}

void si_destroy_perfcounters(struct si_screen *screen)
{
   struct si_perfcounters *pc = screen->perfcounters;

   if (!pc)
      return;

   ac_destroy_perfcounters(&pc->base);
   FREE(pc);
   screen->perfcounters = NULL;
}

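/* Screen-level perf counter initialization. The RADEON_PC_SEPARATE_SE and
 * RADEON_PC_SEPARATE_INSTANCE debug options request per-SE and per-instance
 * counter groups from ac_init_perfcounters. */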
void si_init_perfcounters(struct si_screen *screen)
{
   bool separate_se, separate_instance;

   separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
   separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);

   screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   if (!screen->perfcounters)
      return;

   screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen);
   screen->perfcounters->num_instance_cs_dwords = 3;

   if (!ac_init_perfcounters(&screen->info, separate_se, separate_instance,
                             &screen->perfcounters->base)) {
      si_destroy_perfcounters(screen);
   }
}

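/* Allocate the SPM (streaming perf monitor) output ring buffer and pick the
 * default buffer size and sample interval. */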
static bool
si_spm_init_bo(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size = 32 * 1024 * 1024; /* Default to 32MB. */

   sctx->spm_trace.buffer_size = size;
   sctx->spm_trace.sample_interval = 4096; /* Default to 4096 clk. */

   sctx->spm_trace.bo = ws->buffer_create(
      ws, size, 4096,
      RADEON_DOMAIN_VRAM,
      RADEON_FLAG_NO_INTERPROCESS_SHARING |
         RADEON_FLAG_GTT_WC |
         RADEON_FLAG_NO_SUBALLOC);

   return sctx->spm_trace.bo != NULL;
}


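/* Program the counter select registers for every block used by the SPM trace,
 * then restore GRBM_GFX_INDEX to broadcast mode. */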
static void
si_emit_spm_counters(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm_trace_data *spm_trace = &sctx->spm_trace;

   radeon_begin(cs);

   for (uint32_t b = 0; b < spm_trace->num_used_sq_block_sel; b++) {
      struct ac_spm_block_select *sq_block_sel = &spm_trace->sq_block_sel[b];
      const struct ac_spm_counter_select *cntr_sel = &sq_block_sel->counters[0];
      uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT;

      radeon_set_uconfig_reg_seq(reg_base + b * 4, 1, false);
      radeon_emit(cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */
   }

   for (uint32_t b = 0; b < spm_trace->num_block_sel; b++) {
      struct ac_spm_block_select *block_sel = &spm_trace->block_sel[b];
      struct ac_pc_block_base *regs = block_sel->b->b->b;

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, block_sel->grbm_gfx_index);

      for (unsigned c = 0; c < block_sel->num_counters; c++) {
         const struct ac_spm_counter_select *cntr_sel = &block_sel->counters[c];

         if (!cntr_sel->active)
            continue;

         radeon_set_uconfig_reg_seq(regs->select0[c], 1, false);
         radeon_emit(cntr_sel->sel0);

         radeon_set_uconfig_reg_seq(regs->select1[c], 1, false);
         radeon_emit(cntr_sel->sel1);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   radeon_end();
}

#define SPM_RING_BASE_ALIGN 32

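/* Program the RLC SPM state: ring buffer address/size, sampling interval,
 * per-segment muxsel line counts and the muxsel RAM contents, followed by the
 * counter selects. */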
void
si_emit_spm_setup(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   struct ac_spm_trace_data *spm_trace = &sctx->spm_trace;
   uint64_t va = sctx->screen->ws->buffer_get_virtual_address(spm_trace->bo);
   uint64_t ring_size = spm_trace->buffer_size;

   /* It's required that the ring VA and the size are correctly aligned. */
   assert(!(va & (SPM_RING_BASE_ALIGN - 1)));
   assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1)));
   assert(spm_trace->sample_interval >= 32);

   radeon_begin(cs);

   /* Configure the SPM ring buffer. */
   radeon_set_uconfig_reg(R_037200_RLC_SPM_PERFMON_CNTL,
                          S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */
                          S_037200_PERFMON_SAMPLE_INTERVAL(spm_trace->sample_interval)); /* in sclk */
   radeon_set_uconfig_reg(R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va);
   radeon_set_uconfig_reg(R_037208_RLC_SPM_PERFMON_RING_BASE_HI,
                          S_037208_RING_BASE_HI(va >> 32));
   radeon_set_uconfig_reg(R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size);

   /* Configure the muxsel. */
   uint32_t total_muxsel_lines = 0;
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      total_muxsel_lines += spm_trace->num_muxsel_lines[s];
   }

   radeon_set_uconfig_reg(R_03726C_RLC_SPM_ACCUM_MODE, 0);
   radeon_set_uconfig_reg(R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0);
   radeon_set_uconfig_reg(R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE,
                          S_03727C_SE0_NUM_LINE(spm_trace->num_muxsel_lines[0]) |
                          S_03727C_SE1_NUM_LINE(spm_trace->num_muxsel_lines[1]) |
                          S_03727C_SE2_NUM_LINE(spm_trace->num_muxsel_lines[2]) |
                          S_03727C_SE3_NUM_LINE(spm_trace->num_muxsel_lines[3]));
   radeon_set_uconfig_reg(R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE,
                          S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) |
                          S_037280_GLOBAL_NUM_LINE(spm_trace->num_muxsel_lines[4]));

   /* Upload each muxsel ram to the RLC. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned rlc_muxsel_addr, rlc_muxsel_data;
      unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) |
                                S_030800_INSTANCE_BROADCAST_WRITES(1);

      if (!spm_trace->num_muxsel_lines[s])
         continue;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);

         rlc_muxsel_addr = R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR;
         rlc_muxsel_data = R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA;
      } else {
         grbm_gfx_index |= S_030800_SE_INDEX(s);

         rlc_muxsel_addr = R_03721C_RLC_SPM_SE_MUXSEL_ADDR;
         rlc_muxsel_data = R_037220_RLC_SPM_SE_MUXSEL_DATA;
      }

      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, grbm_gfx_index);

      for (unsigned l = 0; l < spm_trace->num_muxsel_lines[s]; l++) {
         uint32_t *data = (uint32_t *)spm_trace->muxsel_lines[s][l].muxsel;

         /* Select MUXSEL_ADDR to point to the next muxsel. */
         radeon_set_uconfig_reg(rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE);

         /* Write the muxsel line configuration with MUXSEL_DATA. */
         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0));
         radeon_emit(S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) |
                     S_370_WR_CONFIRM(1) |
                     S_370_ENGINE_SEL(V_370_ME) |
                     S_370_WR_ONE_ADDR(1));
         radeon_emit(rlc_muxsel_data >> 2);
         radeon_emit(0);
         radeon_emit_array(data, AC_SPM_MUXSEL_LINE_SIZE);
      }
   }
   radeon_end();

   /* Select SPM counters. */
   si_emit_spm_counters(sctx, cs);
}

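/* One-time SPM setup: build the perf counter layout, select a fixed set of
 * counters to stream (cache hit/miss related), and allocate the output
 * buffer. */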
bool
si_spm_init(struct si_context *sctx)
{
   const struct radeon_info *info = &sctx->screen->info;

   sctx->screen->perfcounters = CALLOC_STRUCT(si_perfcounters);
   sctx->screen->perfcounters->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(sctx->screen);
   sctx->screen->perfcounters->num_instance_cs_dwords = 3;

   struct ac_perfcounters *pc = &sctx->screen->perfcounters->base;
   struct ac_spm_counter_create_info spm_counters[] = {

      /* XXX: doesn't work */
      {TCP, 0, 0x9},    /* Number of L2 requests. */
      {TCP, 0, 0x12},   /* Number of L2 misses. */

      /* Scalar cache hit */
      {SQ, 0, 0x14f},   /* Number of SCACHE hits. */
      {SQ, 0, 0x150},   /* Number of SCACHE misses. */
      {SQ, 0, 0x151},   /* Number of SCACHE misses duplicate. */

      /* Instruction cache hit */
      {SQ, 0, 0x12c},   /* Number of ICACHE hits. */
      {SQ, 0, 0x12d},   /* Number of ICACHE misses. */
      {SQ, 0, 0x12e},   /* Number of ICACHE misses duplicate. */

      /* XXX: doesn't work */
      {GL1C, 0, 0xe},   /* Number of GL1C requests. */
      {GL1C, 0, 0x12},  /* Number of GL1C misses. */

      /* L2 cache hit */
      {GL2C, 0, 0x3},   /* Number of GL2C requests. */
      {GL2C, 0, info->gfx_level >= GFX10_3 ? 0x2b : 0x23},  /* Number of GL2C misses. */
   };

   if (!ac_init_perfcounters(info, false, false, pc))
      return false;

   if (!ac_init_spm(info, pc, ARRAY_SIZE(spm_counters), spm_counters, &sctx->spm_trace))
      return false;

   if (!si_spm_init_bo(sctx))
      return false;

   return true;
}

void
si_spm_finish(struct si_context *sctx)
{
   struct pb_buffer *bo = sctx->spm_trace.bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   ac_destroy_spm(&sctx->spm_trace);
}
934