1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Nicolai Hähnle <nicolai.haehnle@amd.com>
25 *
26 */
27
28 #include "util/u_memory.h"
29 #include "r600_query.h"
30 #include "r600_pipe_common.h"
31 #include "r600d_common.h"
32
/* Upper bound on the number of counters one HW block may expose. It sizes
 * the per-group selector array and is asserted when a block is registered. */
#define R600_QUERY_MAX_COUNTERS 16
35
36 static struct r600_perfcounter_block *
lookup_counter(struct r600_perfcounters * pc,unsigned index,unsigned * base_gid,unsigned * sub_index)37 lookup_counter(struct r600_perfcounters *pc, unsigned index,
38 unsigned *base_gid, unsigned *sub_index)
39 {
40 struct r600_perfcounter_block *block = pc->blocks;
41 unsigned bid;
42
43 *base_gid = 0;
44 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
45 unsigned total = block->num_groups * block->num_selectors;
46
47 if (index < total) {
48 *sub_index = index;
49 return block;
50 }
51
52 index -= total;
53 *base_gid += block->num_groups;
54 }
55
56 return NULL;
57 }
58
59 static struct r600_perfcounter_block *
lookup_group(struct r600_perfcounters * pc,unsigned * index)60 lookup_group(struct r600_perfcounters *pc, unsigned *index)
61 {
62 unsigned bid;
63 struct r600_perfcounter_block *block = pc->blocks;
64
65 for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
66 if (*index < block->num_groups)
67 return block;
68 *index -= block->num_groups;
69 }
70
71 return NULL;
72 }
73
74 struct r600_pc_group {
75 struct r600_pc_group *next;
76 struct r600_perfcounter_block *block;
77 unsigned sub_gid; /* only used during init */
78 unsigned result_base; /* only used during init */
79 int se;
80 int instance;
81 unsigned num_counters;
82 unsigned selectors[R600_QUERY_MAX_COUNTERS];
83 };
84
/* Describes where the values of one user-requested counter live in the
 * result buffer, viewed as an array of uint64s. */
struct r600_pc_counter {
	unsigned base;   /* index of the first value */
	unsigned qwords; /* number of values that are summed into the result */
	unsigned stride; /* distance between consecutive values, in uint64s */
};
90
/* Marker bit stored in r600_query_pc::shaders when a shader-windowed block is
 * used but no explicit shader mask was selected. The constant is unsigned:
 * shifting a signed 1 into bit 31 is undefined behaviour in C. */
#define R600_PC_SHADERS_WINDOWING (1u << 31)
92
93 struct r600_query_pc {
94 struct r600_query_hw b;
95
96 unsigned shaders;
97 unsigned num_counters;
98 struct r600_pc_counter *counters;
99 struct r600_pc_group *groups;
100 };
101
r600_pc_query_destroy(struct r600_common_screen * rscreen,struct r600_query * rquery)102 static void r600_pc_query_destroy(struct r600_common_screen *rscreen,
103 struct r600_query *rquery)
104 {
105 struct r600_query_pc *query = (struct r600_query_pc *)rquery;
106
107 while (query->groups) {
108 struct r600_pc_group *group = query->groups;
109 query->groups = group->next;
110 FREE(group);
111 }
112
113 FREE(query->counters);
114
115 r600_query_hw_destroy(rscreen, rquery);
116 }
117
/**
 * prepare_buffer hook for perfcounter queries.
 *
 * Nothing is written to the result buffer here; the hook always reports
 * success.
 */
static bool r600_pc_query_prepare_buffer(struct r600_common_screen *screen,
					 struct r600_query_hw *hwquery,
					 struct r600_resource *buffer)
{
	/* Intentionally empty. */
	return true;
}
125
r600_pc_query_emit_start(struct r600_common_context * ctx,struct r600_query_hw * hwquery,struct r600_resource * buffer,uint64_t va)126 static void r600_pc_query_emit_start(struct r600_common_context *ctx,
127 struct r600_query_hw *hwquery,
128 struct r600_resource *buffer, uint64_t va)
129 {
130 struct r600_perfcounters *pc = ctx->screen->perfcounters;
131 struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
132 struct r600_pc_group *group;
133 int current_se = -1;
134 int current_instance = -1;
135
136 if (query->shaders)
137 pc->emit_shaders(ctx, query->shaders);
138
139 for (group = query->groups; group; group = group->next) {
140 struct r600_perfcounter_block *block = group->block;
141
142 if (group->se != current_se || group->instance != current_instance) {
143 current_se = group->se;
144 current_instance = group->instance;
145 pc->emit_instance(ctx, group->se, group->instance);
146 }
147
148 pc->emit_select(ctx, block, group->num_counters, group->selectors);
149 }
150
151 if (current_se != -1 || current_instance != -1)
152 pc->emit_instance(ctx, -1, -1);
153
154 pc->emit_start(ctx, buffer, va);
155 }
156
r600_pc_query_emit_stop(struct r600_common_context * ctx,struct r600_query_hw * hwquery,struct r600_resource * buffer,uint64_t va)157 static void r600_pc_query_emit_stop(struct r600_common_context *ctx,
158 struct r600_query_hw *hwquery,
159 struct r600_resource *buffer, uint64_t va)
160 {
161 struct r600_perfcounters *pc = ctx->screen->perfcounters;
162 struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
163 struct r600_pc_group *group;
164
165 pc->emit_stop(ctx, buffer, va);
166
167 for (group = query->groups; group; group = group->next) {
168 struct r600_perfcounter_block *block = group->block;
169 unsigned se = group->se >= 0 ? group->se : 0;
170 unsigned se_end = se + 1;
171
172 if ((block->flags & R600_PC_BLOCK_SE) && (group->se < 0))
173 se_end = ctx->screen->info.max_se;
174
175 do {
176 unsigned instance = group->instance >= 0 ? group->instance : 0;
177
178 do {
179 pc->emit_instance(ctx, se, instance);
180 pc->emit_read(ctx, block,
181 group->num_counters, group->selectors,
182 buffer, va);
183 va += sizeof(uint64_t) * group->num_counters;
184 } while (group->instance < 0 && ++instance < block->num_instances);
185 } while (++se < se_end);
186 }
187
188 pc->emit_instance(ctx, -1, -1);
189 }
190
r600_pc_query_clear_result(struct r600_query_hw * hwquery,union pipe_query_result * result)191 static void r600_pc_query_clear_result(struct r600_query_hw *hwquery,
192 union pipe_query_result *result)
193 {
194 struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
195
196 memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
197 }
198
r600_pc_query_add_result(struct r600_common_screen * rscreen,struct r600_query_hw * hwquery,void * buffer,union pipe_query_result * result)199 static void r600_pc_query_add_result(struct r600_common_screen *rscreen,
200 struct r600_query_hw *hwquery,
201 void *buffer,
202 union pipe_query_result *result)
203 {
204 struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
205 uint64_t *results = buffer;
206 unsigned i, j;
207
208 for (i = 0; i < query->num_counters; ++i) {
209 struct r600_pc_counter *counter = &query->counters[i];
210
211 for (j = 0; j < counter->qwords; ++j) {
212 uint32_t value = results[counter->base + j * counter->stride];
213 result->batch[i].u64 += value;
214 }
215 }
216 }
217
218 static struct r600_query_ops batch_query_ops = {
219 .destroy = r600_pc_query_destroy,
220 .begin = r600_query_hw_begin,
221 .end = r600_query_hw_end,
222 .get_result = r600_query_hw_get_result
223 };
224
225 static struct r600_query_hw_ops batch_query_hw_ops = {
226 .prepare_buffer = r600_pc_query_prepare_buffer,
227 .emit_start = r600_pc_query_emit_start,
228 .emit_stop = r600_pc_query_emit_stop,
229 .clear_result = r600_pc_query_clear_result,
230 .add_result = r600_pc_query_add_result,
231 };
232
get_group_state(struct r600_common_screen * screen,struct r600_query_pc * query,struct r600_perfcounter_block * block,unsigned sub_gid)233 static struct r600_pc_group *get_group_state(struct r600_common_screen *screen,
234 struct r600_query_pc *query,
235 struct r600_perfcounter_block *block,
236 unsigned sub_gid)
237 {
238 struct r600_pc_group *group = query->groups;
239
240 while (group) {
241 if (group->block == block && group->sub_gid == sub_gid)
242 return group;
243 group = group->next;
244 }
245
246 group = CALLOC_STRUCT(r600_pc_group);
247 if (!group)
248 return NULL;
249
250 group->block = block;
251 group->sub_gid = sub_gid;
252
253 if (block->flags & R600_PC_BLOCK_SHADER) {
254 unsigned sub_gids = block->num_instances;
255 unsigned shader_id;
256 unsigned shaders;
257 unsigned query_shaders;
258
259 if (block->flags & R600_PC_BLOCK_SE_GROUPS)
260 sub_gids = sub_gids * screen->info.max_se;
261 shader_id = sub_gid / sub_gids;
262 sub_gid = sub_gid % sub_gids;
263
264 shaders = screen->perfcounters->shader_type_bits[shader_id];
265
266 query_shaders = query->shaders & ~R600_PC_SHADERS_WINDOWING;
267 if (query_shaders && query_shaders != shaders) {
268 fprintf(stderr, "r600_perfcounter: incompatible shader groups\n");
269 FREE(group);
270 return NULL;
271 }
272 query->shaders = shaders;
273 }
274
275 if (block->flags & R600_PC_BLOCK_SHADER_WINDOWED && !query->shaders) {
276 // A non-zero value in query->shaders ensures that the shader
277 // masking is reset unless the user explicitly requests one.
278 query->shaders = R600_PC_SHADERS_WINDOWING;
279 }
280
281 if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
282 group->se = sub_gid / block->num_instances;
283 sub_gid = sub_gid % block->num_instances;
284 } else {
285 group->se = -1;
286 }
287
288 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
289 group->instance = sub_gid;
290 } else {
291 group->instance = -1;
292 }
293
294 group->next = query->groups;
295 query->groups = group;
296
297 return group;
298 }
299
r600_create_batch_query(struct pipe_context * ctx,unsigned num_queries,unsigned * query_types)300 struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
301 unsigned num_queries,
302 unsigned *query_types)
303 {
304 struct r600_common_screen *screen =
305 (struct r600_common_screen *)ctx->screen;
306 struct r600_perfcounters *pc = screen->perfcounters;
307 struct r600_perfcounter_block *block;
308 struct r600_pc_group *group;
309 struct r600_query_pc *query;
310 unsigned base_gid, sub_gid, sub_index;
311 unsigned i, j;
312
313 if (!pc)
314 return NULL;
315
316 query = CALLOC_STRUCT(r600_query_pc);
317 if (!query)
318 return NULL;
319
320 query->b.b.ops = &batch_query_ops;
321 query->b.ops = &batch_query_hw_ops;
322
323 query->num_counters = num_queries;
324
325 /* Collect selectors per group */
326 for (i = 0; i < num_queries; ++i) {
327 unsigned sub_gid;
328
329 if (query_types[i] < R600_QUERY_FIRST_PERFCOUNTER)
330 goto error;
331
332 block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER,
333 &base_gid, &sub_index);
334 if (!block)
335 goto error;
336
337 sub_gid = sub_index / block->num_selectors;
338 sub_index = sub_index % block->num_selectors;
339
340 group = get_group_state(screen, query, block, sub_gid);
341 if (!group)
342 goto error;
343
344 if (group->num_counters >= block->num_counters) {
345 fprintf(stderr,
346 "perfcounter group %s: too many selected\n",
347 block->basename);
348 goto error;
349 }
350 group->selectors[group->num_counters] = sub_index;
351 ++group->num_counters;
352 }
353
354 /* Compute result bases and CS size per group */
355 query->b.num_cs_dw_begin = pc->num_start_cs_dwords;
356 query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
357
358 query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */
359 query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
360
361 i = 0;
362 for (group = query->groups; group; group = group->next) {
363 struct r600_perfcounter_block *block = group->block;
364 unsigned select_dw, read_dw;
365 unsigned instances = 1;
366
367 if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
368 instances = screen->info.max_se;
369 if (group->instance < 0)
370 instances *= block->num_instances;
371
372 group->result_base = i;
373 query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
374 i += instances * group->num_counters;
375
376 pc->get_size(block, group->num_counters, group->selectors,
377 &select_dw, &read_dw);
378 query->b.num_cs_dw_begin += select_dw;
379 query->b.num_cs_dw_end += instances * read_dw;
380 query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */
381 query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
382 }
383
384 if (query->shaders) {
385 if (query->shaders == R600_PC_SHADERS_WINDOWING)
386 query->shaders = 0xffffffff;
387 query->b.num_cs_dw_begin += pc->num_shaders_cs_dwords;
388 }
389
390 /* Map user-supplied query array to result indices */
391 query->counters = CALLOC(num_queries, sizeof(*query->counters));
392 for (i = 0; i < num_queries; ++i) {
393 struct r600_pc_counter *counter = &query->counters[i];
394 struct r600_perfcounter_block *block;
395
396 block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER,
397 &base_gid, &sub_index);
398
399 sub_gid = sub_index / block->num_selectors;
400 sub_index = sub_index % block->num_selectors;
401
402 group = get_group_state(screen, query, block, sub_gid);
403 assert(group != NULL);
404
405 for (j = 0; j < group->num_counters; ++j) {
406 if (group->selectors[j] == sub_index)
407 break;
408 }
409
410 counter->base = group->result_base + j;
411 counter->stride = group->num_counters;
412
413 counter->qwords = 1;
414 if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
415 counter->qwords = screen->info.max_se;
416 if (group->instance < 0)
417 counter->qwords *= block->num_instances;
418 }
419
420 if (!r600_query_hw_init(screen, &query->b))
421 goto error;
422
423 return (struct pipe_query *)query;
424
425 error:
426 r600_pc_query_destroy(screen, &query->b.b);
427 return NULL;
428 }
429
r600_init_block_names(struct r600_common_screen * screen,struct r600_perfcounter_block * block)430 static bool r600_init_block_names(struct r600_common_screen *screen,
431 struct r600_perfcounter_block *block)
432 {
433 unsigned i, j, k;
434 unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
435 unsigned namelen;
436 char *groupname;
437 char *p;
438
439 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
440 groups_instance = block->num_instances;
441 if (block->flags & R600_PC_BLOCK_SE_GROUPS)
442 groups_se = screen->info.max_se;
443 if (block->flags & R600_PC_BLOCK_SHADER)
444 groups_shader = screen->perfcounters->num_shader_types;
445
446 namelen = strlen(block->basename);
447 block->group_name_stride = namelen + 1;
448 if (block->flags & R600_PC_BLOCK_SHADER)
449 block->group_name_stride += 3;
450 if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
451 assert(groups_se <= 10);
452 block->group_name_stride += 1;
453
454 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
455 block->group_name_stride += 1;
456 }
457 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
458 assert(groups_instance <= 100);
459 block->group_name_stride += 2;
460 }
461
462 block->group_names = MALLOC(block->num_groups * block->group_name_stride);
463 if (!block->group_names)
464 return false;
465
466 groupname = block->group_names;
467 for (i = 0; i < groups_shader; ++i) {
468 const char *shader_suffix = screen->perfcounters->shader_type_suffixes[i];
469 unsigned shaderlen = strlen(shader_suffix);
470 for (j = 0; j < groups_se; ++j) {
471 for (k = 0; k < groups_instance; ++k) {
472 strcpy(groupname, block->basename);
473 p = groupname + namelen;
474
475 if (block->flags & R600_PC_BLOCK_SHADER) {
476 strcpy(p, shader_suffix);
477 p += shaderlen;
478 }
479
480 if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
481 p += sprintf(p, "%d", j);
482 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
483 *p++ = '_';
484 }
485
486 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
487 p += sprintf(p, "%d", k);
488
489 groupname += block->group_name_stride;
490 }
491 }
492 }
493
494 assert(block->num_selectors <= 1000);
495 block->selector_name_stride = block->group_name_stride + 4;
496 block->selector_names = MALLOC(block->num_groups * block->num_selectors *
497 block->selector_name_stride);
498 if (!block->selector_names)
499 return false;
500
501 groupname = block->group_names;
502 p = block->selector_names;
503 for (i = 0; i < block->num_groups; ++i) {
504 for (j = 0; j < block->num_selectors; ++j) {
505 sprintf(p, "%s_%03d", groupname, j);
506 p += block->selector_name_stride;
507 }
508 groupname += block->group_name_stride;
509 }
510
511 return true;
512 }
513
r600_get_perfcounter_info(struct r600_common_screen * screen,unsigned index,struct pipe_driver_query_info * info)514 int r600_get_perfcounter_info(struct r600_common_screen *screen,
515 unsigned index,
516 struct pipe_driver_query_info *info)
517 {
518 struct r600_perfcounters *pc = screen->perfcounters;
519 struct r600_perfcounter_block *block;
520 unsigned base_gid, sub;
521
522 if (!pc)
523 return 0;
524
525 if (!info) {
526 unsigned bid, num_queries = 0;
527
528 for (bid = 0; bid < pc->num_blocks; ++bid) {
529 num_queries += pc->blocks[bid].num_selectors *
530 pc->blocks[bid].num_groups;
531 }
532
533 return num_queries;
534 }
535
536 block = lookup_counter(pc, index, &base_gid, &sub);
537 if (!block)
538 return 0;
539
540 if (!block->selector_names) {
541 if (!r600_init_block_names(screen, block))
542 return 0;
543 }
544 info->name = block->selector_names + sub * block->selector_name_stride;
545 info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
546 info->max_value.u64 = 0;
547 info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
548 info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
549 info->group_id = base_gid + sub / block->num_selectors;
550 info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
551 if (sub > 0 && sub + 1 < block->num_selectors * block->num_groups)
552 info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST;
553 return 1;
554 }
555
r600_get_perfcounter_group_info(struct r600_common_screen * screen,unsigned index,struct pipe_driver_query_group_info * info)556 int r600_get_perfcounter_group_info(struct r600_common_screen *screen,
557 unsigned index,
558 struct pipe_driver_query_group_info *info)
559 {
560 struct r600_perfcounters *pc = screen->perfcounters;
561 struct r600_perfcounter_block *block;
562
563 if (!pc)
564 return 0;
565
566 if (!info)
567 return pc->num_groups;
568
569 block = lookup_group(pc, &index);
570 if (!block)
571 return 0;
572
573 if (!block->group_names) {
574 if (!r600_init_block_names(screen, block))
575 return 0;
576 }
577 info->name = block->group_names + index * block->group_name_stride;
578 info->num_queries = block->num_selectors;
579 info->max_active_queries = block->num_counters;
580 return 1;
581 }
582
r600_perfcounters_destroy(struct r600_common_screen * rscreen)583 void r600_perfcounters_destroy(struct r600_common_screen *rscreen)
584 {
585 if (rscreen->perfcounters)
586 rscreen->perfcounters->cleanup(rscreen);
587 }
588
r600_perfcounters_init(struct r600_perfcounters * pc,unsigned num_blocks)589 bool r600_perfcounters_init(struct r600_perfcounters *pc,
590 unsigned num_blocks)
591 {
592 pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block));
593 if (!pc->blocks)
594 return false;
595
596 pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
597 pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
598
599 return true;
600 }
601
r600_perfcounters_add_block(struct r600_common_screen * rscreen,struct r600_perfcounters * pc,const char * name,unsigned flags,unsigned counters,unsigned selectors,unsigned instances,void * data)602 void r600_perfcounters_add_block(struct r600_common_screen *rscreen,
603 struct r600_perfcounters *pc,
604 const char *name, unsigned flags,
605 unsigned counters, unsigned selectors,
606 unsigned instances, void *data)
607 {
608 struct r600_perfcounter_block *block = &pc->blocks[pc->num_blocks];
609
610 assert(counters <= R600_QUERY_MAX_COUNTERS);
611
612 block->basename = name;
613 block->flags = flags;
614 block->num_counters = counters;
615 block->num_selectors = selectors;
616 block->num_instances = MAX2(instances, 1);
617 block->data = data;
618
619 if (pc->separate_se && (block->flags & R600_PC_BLOCK_SE))
620 block->flags |= R600_PC_BLOCK_SE_GROUPS;
621 if (pc->separate_instance && block->num_instances > 1)
622 block->flags |= R600_PC_BLOCK_INSTANCE_GROUPS;
623
624 if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
625 block->num_groups = block->num_instances;
626 } else {
627 block->num_groups = 1;
628 }
629
630 if (block->flags & R600_PC_BLOCK_SE_GROUPS)
631 block->num_groups *= rscreen->info.max_se;
632 if (block->flags & R600_PC_BLOCK_SHADER)
633 block->num_groups *= pc->num_shader_types;
634
635 ++pc->num_blocks;
636 pc->num_groups += block->num_groups;
637 }
638
r600_perfcounters_do_destroy(struct r600_perfcounters * pc)639 void r600_perfcounters_do_destroy(struct r600_perfcounters *pc)
640 {
641 unsigned i;
642
643 for (i = 0; i < pc->num_blocks; ++i) {
644 FREE(pc->blocks[i].group_names);
645 FREE(pc->blocks[i].selector_names);
646 }
647 FREE(pc->blocks);
648 FREE(pc);
649 }
650