1 /*
2 * Copyright 2015 Samuel Pitoiset
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include "nvc0/nvc0_context.h"
24 #include "nvc0/nvc0_query_hw_metric.h"
25 #include "nvc0/nvc0_query_hw_sm.h"
26
27 #define _Q(i,n,t,d) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t, d }
28 static const struct nvc0_hw_metric_cfg {
29 unsigned id;
30 const char *name;
31 enum pipe_driver_query_type type;
32 const char *desc;
33 } nvc0_hw_metric_queries[] = {
34 _Q(ACHIEVED_OCCUPANCY,
35 "metric-achieved_occupancy",
36 PERCENTAGE,
37 "Ratio of the average active warps per active cycle to the maximum "
38 "number of warps supported on a multiprocessor"),
39
40 _Q(BRANCH_EFFICIENCY,
41 "metric-branch_efficiency",
42 PERCENTAGE,
43 "Ratio of non-divergent branches to total branches"),
44
45 _Q(INST_ISSUED,
46 "metric-inst_issued",
47 UINT64,
48 "The number of instructions issued"),
49
50 _Q(INST_PER_WRAP,
51 "metric-inst_per_wrap",
52 UINT64,
53 "Average number of instructions executed by each warp"),
54
55 _Q(INST_REPLAY_OVERHEAD,
56 "metric-inst_replay_overhead",
57 UINT64,
58 "Average number of replays for each instruction executed"),
59
60 _Q(ISSUED_IPC,
61 "metric-issued_ipc",
62 UINT64,
63 "Instructions issued per cycle"),
64
65 _Q(ISSUE_SLOTS,
66 "metric-issue_slots",
67 UINT64,
68 "The number of issue slots used"),
69
70 _Q(ISSUE_SLOT_UTILIZATION,
71 "metric-issue_slot_utilization",
72 PERCENTAGE,
73 "Percentage of issue slots that issued at least one instruction, "
74 "averaged across all cycles"),
75
76 _Q(IPC,
77 "metric-ipc",
78 UINT64,
79 "Instructions executed per cycle"),
80
81 _Q(SHARED_REPLAY_OVERHEAD,
82 "metric-shared_replay_overhead",
83 UINT64,
84 "Average number of replays due to shared memory conflicts for each "
85 "instruction executed"),
86
87 _Q(WARP_EXECUTION_EFFICIENCY,
88 "metric-warp_execution_efficiency",
89 PERCENTAGE,
90 "Ratio of the average active threads per warp to the maximum number of "
91 "threads per warp supported on a multiprocessor"),
92
93 _Q(WARP_NONPRED_EXECUTION_EFFICIENCY,
94 "metric-warp_nonpred_execution_efficiency",
95 PERCENTAGE,
96 "Ratio of the average active threads per warp executing non-predicated "
97 "instructions to the maximum number of threads per warp supported on a "
98 "multiprocessor"),
99 };
100
101 #undef _Q
102
103 static inline const struct nvc0_hw_metric_cfg *
nvc0_hw_metric_get_cfg(unsigned metric_id)104 nvc0_hw_metric_get_cfg(unsigned metric_id)
105 {
106 unsigned i;
107
108 for (i = 0; i < ARRAY_SIZE(nvc0_hw_metric_queries); i++) {
109 if (nvc0_hw_metric_queries[i].id == metric_id)
110 return &nvc0_hw_metric_queries[i];
111 }
112 assert(0);
113 return NULL;
114 }
115
/*
 * Recipe for one metric on one chip generation: which metric it is and which
 * hardware SM counter queries have to be sampled to compute it.
 */
struct nvc0_hw_metric_query_cfg {
   unsigned type;          /* NVC0_HW_METRIC_QUERY_* id of the metric */
   uint32_t queries[8];    /* SM query types; entry i ends up in res64[i] of
                            * the *_hw_metric_calc_result() helpers */
   uint32_t num_queries;   /* number of entries of queries[] in use */
};
121
122 #define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n)
123
124 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
125 static const struct nvc0_hw_metric_query_cfg
126 sm20_achieved_occupancy =
127 {
128 .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY,
129 .queries[0] = _SM(ACTIVE_WARPS),
130 .queries[1] = _SM(ACTIVE_CYCLES),
131 .num_queries = 2,
132 };
133
134 static const struct nvc0_hw_metric_query_cfg
135 sm20_branch_efficiency =
136 {
137 .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY,
138 .queries[0] = _SM(BRANCH),
139 .queries[1] = _SM(DIVERGENT_BRANCH),
140 .num_queries = 2,
141 };
142
143 static const struct nvc0_hw_metric_query_cfg
144 sm20_inst_per_wrap =
145 {
146 .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP,
147 .queries[0] = _SM(INST_EXECUTED),
148 .queries[1] = _SM(WARPS_LAUNCHED),
149 .num_queries = 2,
150 };
151
152 static const struct nvc0_hw_metric_query_cfg
153 sm20_inst_replay_overhead =
154 {
155 .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
156 .queries[0] = _SM(INST_ISSUED),
157 .queries[1] = _SM(INST_EXECUTED),
158 .num_queries = 2,
159 };
160
161 static const struct nvc0_hw_metric_query_cfg
162 sm20_issued_ipc =
163 {
164 .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
165 .queries[0] = _SM(INST_ISSUED),
166 .queries[1] = _SM(ACTIVE_CYCLES),
167 .num_queries = 2,
168 };
169
170 static const struct nvc0_hw_metric_query_cfg
171 sm20_issue_slot_utilization =
172 {
173 .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
174 .queries[0] = _SM(INST_ISSUED),
175 .queries[1] = _SM(ACTIVE_CYCLES),
176 .num_queries = 2,
177 };
178
179 static const struct nvc0_hw_metric_query_cfg
180 sm20_ipc =
181 {
182 .type = NVC0_HW_METRIC_QUERY_IPC,
183 .queries[0] = _SM(INST_EXECUTED),
184 .queries[1] = _SM(ACTIVE_CYCLES),
185 .num_queries = 2,
186 };
187
188 static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] =
189 {
190 &sm20_achieved_occupancy,
191 &sm20_branch_efficiency,
192 &sm20_inst_per_wrap,
193 &sm20_inst_replay_overhead,
194 &sm20_ipc,
195 &sm20_issued_ipc,
196 &sm20_issue_slot_utilization,
197 };
198
199 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
200 static const struct nvc0_hw_metric_query_cfg
201 sm21_inst_issued =
202 {
203 .type = NVC0_HW_METRIC_QUERY_INST_ISSUED,
204 .queries[0] = _SM(INST_ISSUED1_0),
205 .queries[1] = _SM(INST_ISSUED1_1),
206 .queries[2] = _SM(INST_ISSUED2_0),
207 .queries[3] = _SM(INST_ISSUED2_1),
208 .num_queries = 4,
209 };
210
211 static const struct nvc0_hw_metric_query_cfg
212 sm21_inst_replay_overhead =
213 {
214 .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
215 .queries[0] = _SM(INST_ISSUED1_0),
216 .queries[1] = _SM(INST_ISSUED1_1),
217 .queries[2] = _SM(INST_ISSUED2_0),
218 .queries[3] = _SM(INST_ISSUED2_1),
219 .queries[4] = _SM(INST_EXECUTED),
220 .num_queries = 5,
221 };
222
223 static const struct nvc0_hw_metric_query_cfg
224 sm21_issued_ipc =
225 {
226 .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
227 .queries[0] = _SM(INST_ISSUED1_0),
228 .queries[1] = _SM(INST_ISSUED1_1),
229 .queries[2] = _SM(INST_ISSUED2_0),
230 .queries[3] = _SM(INST_ISSUED2_1),
231 .queries[4] = _SM(ACTIVE_CYCLES),
232 .num_queries = 5,
233 };
234
235 static const struct nvc0_hw_metric_query_cfg
236 sm21_issue_slots =
237 {
238 .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
239 .queries[0] = _SM(INST_ISSUED1_0),
240 .queries[1] = _SM(INST_ISSUED1_1),
241 .queries[2] = _SM(INST_ISSUED2_0),
242 .queries[3] = _SM(INST_ISSUED2_1),
243 .num_queries = 4,
244 };
245
246 static const struct nvc0_hw_metric_query_cfg
247 sm21_issue_slot_utilization =
248 {
249 .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
250 .queries[0] = _SM(INST_ISSUED1_0),
251 .queries[1] = _SM(INST_ISSUED1_1),
252 .queries[2] = _SM(INST_ISSUED2_0),
253 .queries[3] = _SM(INST_ISSUED2_1),
254 .queries[4] = _SM(ACTIVE_CYCLES),
255 .num_queries = 5,
256 };
257
258 static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] =
259 {
260 &sm20_achieved_occupancy,
261 &sm20_branch_efficiency,
262 &sm21_inst_issued,
263 &sm20_inst_per_wrap,
264 &sm21_inst_replay_overhead,
265 &sm20_ipc,
266 &sm21_issued_ipc,
267 &sm21_issue_slots,
268 &sm21_issue_slot_utilization,
269 };
270
271 /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */
272 static const struct nvc0_hw_metric_query_cfg
273 sm30_inst_issued =
274 {
275 .type = NVC0_HW_METRIC_QUERY_INST_ISSUED,
276 .queries[0] = _SM(INST_ISSUED1),
277 .queries[1] = _SM(INST_ISSUED2),
278 .num_queries = 2,
279 };
280
281 static const struct nvc0_hw_metric_query_cfg
282 sm30_inst_replay_overhead =
283 {
284 .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
285 .queries[0] = _SM(INST_ISSUED1),
286 .queries[1] = _SM(INST_ISSUED2),
287 .queries[2] = _SM(INST_EXECUTED),
288 .num_queries = 3,
289 };
290
291 static const struct nvc0_hw_metric_query_cfg
292 sm30_issued_ipc =
293 {
294 .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
295 .queries[0] = _SM(INST_ISSUED1),
296 .queries[1] = _SM(INST_ISSUED2),
297 .queries[2] = _SM(ACTIVE_CYCLES),
298 .num_queries = 3,
299 };
300
301 static const struct nvc0_hw_metric_query_cfg
302 sm30_issue_slots =
303 {
304 .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
305 .queries[0] = _SM(INST_ISSUED1),
306 .queries[1] = _SM(INST_ISSUED2),
307 .num_queries = 2,
308 };
309
310 static const struct nvc0_hw_metric_query_cfg
311 sm30_issue_slot_utilization =
312 {
313 .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
314 .queries[0] = _SM(INST_ISSUED1),
315 .queries[1] = _SM(INST_ISSUED2),
316 .queries[2] = _SM(ACTIVE_CYCLES),
317 .num_queries = 3,
318 };
319
320 static const struct nvc0_hw_metric_query_cfg
321 sm30_shared_replay_overhead =
322 {
323 .type = NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD,
324 .queries[0] = _SM(SHARED_LD_REPLAY),
325 .queries[1] = _SM(SHARED_ST_REPLAY),
326 .queries[2] = _SM(INST_EXECUTED),
327 .num_queries = 3,
328 };
329
330 static const struct nvc0_hw_metric_query_cfg
331 sm30_warp_execution_efficiency =
332 {
333 .type = NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY,
334 .queries[0] = _SM(INST_EXECUTED),
335 .queries[1] = _SM(TH_INST_EXECUTED),
336 .num_queries = 2,
337 };
338
339 static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] =
340 {
341 &sm20_achieved_occupancy,
342 &sm20_branch_efficiency,
343 &sm30_inst_issued,
344 &sm20_inst_per_wrap,
345 &sm30_inst_replay_overhead,
346 &sm20_ipc,
347 &sm30_issued_ipc,
348 &sm30_issue_slots,
349 &sm30_issue_slot_utilization,
350 &sm30_shared_replay_overhead,
351 &sm30_warp_execution_efficiency,
352 };
353
354 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
355 static const struct nvc0_hw_metric_query_cfg
356 sm35_warp_nonpred_execution_efficiency =
357 {
358 .type = NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY,
359 .queries[0] = _SM(INST_EXECUTED),
360 .queries[1] = _SM(NOT_PRED_OFF_INST_EXECUTED),
361 .num_queries = 2,
362 };
363
364 static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] =
365 {
366 &sm20_achieved_occupancy,
367 &sm30_inst_issued,
368 &sm20_inst_per_wrap,
369 &sm30_inst_replay_overhead,
370 &sm20_ipc,
371 &sm30_issued_ipc,
372 &sm30_issue_slots,
373 &sm30_issue_slot_utilization,
374 &sm30_shared_replay_overhead,
375 &sm30_warp_execution_efficiency,
376 &sm35_warp_nonpred_execution_efficiency,
377 };
378
/* ==== Compute capability 5.0 (GM107/GM108) ==== */
/*
 * Metrics offered on SM 5.0; the array index is the driver-query id.
 * No configuration is specific to this generation: every entry reuses an
 * SM 2.0, 3.0 or 3.5 recipe.  Compared with the SM 3.0 list,
 * shared_replay_overhead is absent and warp_nonpred_execution_efficiency is
 * present.
 */
static const struct nvc0_hw_metric_query_cfg *sm50_hw_metric_queries[] =
{
   &sm20_achieved_occupancy,
   &sm20_branch_efficiency,
   &sm30_inst_issued,
   &sm20_inst_per_wrap,
   &sm30_inst_replay_overhead,
   &sm20_ipc,
   &sm30_issued_ipc,
   &sm30_issue_slots,
   &sm30_issue_slot_utilization,
   &sm30_warp_execution_efficiency,
   &sm35_warp_nonpred_execution_efficiency,
};
394
395 #undef _SM
396
397 static inline const struct nvc0_hw_metric_query_cfg **
nvc0_hw_metric_get_queries(struct nvc0_screen * screen)398 nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
399 {
400 struct nouveau_device *dev = screen->base.device;
401
402 switch (screen->base.class_3d) {
403 case GM200_3D_CLASS:
404 case GM107_3D_CLASS:
405 return sm50_hw_metric_queries;
406 case NVF0_3D_CLASS:
407 return sm35_hw_metric_queries;
408 case NVE4_3D_CLASS:
409 return sm30_hw_metric_queries;
410 case NVC0_3D_CLASS:
411 case NVC1_3D_CLASS:
412 case NVC8_3D_CLASS:
413 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
414 return sm20_hw_metric_queries;
415 return sm21_hw_metric_queries;
416 }
417 assert(0);
418 return NULL;
419 }
420
421 unsigned
nvc0_hw_metric_get_num_queries(struct nvc0_screen * screen)422 nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen)
423 {
424 struct nouveau_device *dev = screen->base.device;
425
426 switch (screen->base.class_3d) {
427 case GM200_3D_CLASS:
428 case GM107_3D_CLASS:
429 return ARRAY_SIZE(sm50_hw_metric_queries);
430 case NVF0_3D_CLASS:
431 return ARRAY_SIZE(sm35_hw_metric_queries);
432 case NVE4_3D_CLASS:
433 return ARRAY_SIZE(sm30_hw_metric_queries);
434 case NVC0_3D_CLASS:
435 case NVC1_3D_CLASS:
436 case NVC8_3D_CLASS:
437 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
438 return ARRAY_SIZE(sm20_hw_metric_queries);
439 return ARRAY_SIZE(sm21_hw_metric_queries);
440 }
441 return 0;
442 }
443
444 static const struct nvc0_hw_metric_query_cfg *
nvc0_hw_metric_query_get_cfg(struct nvc0_context * nvc0,struct nvc0_hw_query * hq)445 nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
446 {
447 const struct nvc0_hw_metric_query_cfg **queries;
448 struct nvc0_screen *screen = nvc0->screen;
449 struct nvc0_query *q = &hq->base;
450 unsigned num_queries;
451 unsigned i;
452
453 num_queries = nvc0_hw_metric_get_num_queries(screen);
454 queries = nvc0_hw_metric_get_queries(screen);
455
456 for (i = 0; i < num_queries; i++) {
457 if (NVC0_HW_METRIC_QUERY(queries[i]->type) == q->type)
458 return queries[i];
459 }
460 assert(0);
461 return NULL;
462 }
463
464 static void
nvc0_hw_metric_destroy_query(struct nvc0_context * nvc0,struct nvc0_hw_query * hq)465 nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
466 struct nvc0_hw_query *hq)
467 {
468 struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
469 unsigned i;
470
471 for (i = 0; i < hmq->num_queries; i++)
472 if (hmq->queries[i]->funcs->destroy_query)
473 hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
474 FREE(hmq);
475 }
476
477 static bool
nvc0_hw_metric_begin_query(struct nvc0_context * nvc0,struct nvc0_hw_query * hq)478 nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
479 {
480 struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
481 bool ret = false;
482 unsigned i;
483
484 for (i = 0; i < hmq->num_queries; i++) {
485 ret = hmq->queries[i]->funcs->begin_query(nvc0, hmq->queries[i]);
486 if (!ret)
487 return ret;
488 }
489 return ret;
490 }
491
492 static void
nvc0_hw_metric_end_query(struct nvc0_context * nvc0,struct nvc0_hw_query * hq)493 nvc0_hw_metric_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
494 {
495 struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
496 unsigned i;
497
498 for (i = 0; i < hmq->num_queries; i++)
499 hmq->queries[i]->funcs->end_query(nvc0, hmq->queries[i]);
500 }
501
502 static uint64_t
sm20_hw_metric_calc_result(struct nvc0_hw_query * hq,uint64_t res64[8])503 sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
504 {
505 switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
506 case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
507 /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
508 if (res64[1])
509 return ((res64[0] / (double)res64[1]) / 48) * 100;
510 break;
511 case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
512 /* (branch / (branch + divergent_branch)) * 100 */
513 if (res64[0] + res64[1])
514 return (res64[0] / (double)(res64[0] + res64[1])) * 100;
515 break;
516 case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
517 /* inst_executed / warps_launched */
518 if (res64[1])
519 return res64[0] / (double)res64[1];
520 break;
521 case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
522 /* (inst_issued - inst_executed) / inst_executed */
523 if (res64[1])
524 return (res64[0] - res64[1]) / (double)res64[1];
525 break;
526 case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
527 /* inst_issued / active_cycles */
528 if (res64[1])
529 return res64[0] / (double)res64[1];
530 break;
531 case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
532 /* ((inst_issued / 2) / active_cycles) * 100 */
533 if (res64[1])
534 return ((res64[0] / 2) / (double)res64[1]) * 100;
535 break;
536 case NVC0_HW_METRIC_QUERY_IPC:
537 /* inst_executed / active_cycles */
538 if (res64[1])
539 return res64[0] / (double)res64[1];
540 break;
541 default:
542 debug_printf("invalid metric type: %d\n",
543 hq->base.type - NVC0_HW_METRIC_QUERY(0));
544 break;
545 }
546 return 0;
547 }
548
549 static uint64_t
sm21_hw_metric_calc_result(struct nvc0_hw_query * hq,uint64_t res64[8])550 sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
551 {
552 switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
553 case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
554 return sm20_hw_metric_calc_result(hq, res64);
555 case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
556 return sm20_hw_metric_calc_result(hq, res64);
557 case NVC0_HW_METRIC_QUERY_INST_ISSUED:
558 /* issued1_0 + issued1_1 + (issued2_0 + issued2_1) * 2 */
559 return res64[0] + res64[1] + (res64[2] + res64[3]) * 2;
560 break;
561 case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
562 return sm20_hw_metric_calc_result(hq, res64);
563 case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
564 /* (metric-inst_issued - inst_executed) / inst_executed */
565 if (res64[4])
566 return (((res64[0] + res64[1] + (res64[2] + res64[3]) * 2) -
567 res64[4]) / (double)res64[4]);
568 break;
569 case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
570 /* metric-inst_issued / active_cycles */
571 if (res64[4])
572 return (res64[0] + res64[1] + (res64[2] + res64[3]) * 2) /
573 (double)res64[4];
574 break;
575 case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS:
576 /* issued1_0 + issued1_1 + issued2_0 + issued2_1 */
577 return res64[0] + res64[1] + res64[2] + res64[3];
578 break;
579 case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
580 /* ((metric-issue_slots / 2) / active_cycles) * 100 */
581 if (res64[4])
582 return (((res64[0] + res64[1] + res64[2] + res64[3]) / 2) /
583 (double)res64[4]) * 100;
584 break;
585 case NVC0_HW_METRIC_QUERY_IPC:
586 return sm20_hw_metric_calc_result(hq, res64);
587 default:
588 debug_printf("invalid metric type: %d\n",
589 hq->base.type - NVC0_HW_METRIC_QUERY(0));
590 break;
591 }
592 return 0;
593 }
594
595 static uint64_t
sm30_hw_metric_calc_result(struct nvc0_hw_query * hq,uint64_t res64[8])596 sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
597 {
598 switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
599 case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
600 /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
601 if (res64[1])
602 return ((res64[0] / (double)res64[1]) / 64) * 100;
603 break;
604 case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
605 return sm20_hw_metric_calc_result(hq, res64);
606 case NVC0_HW_METRIC_QUERY_INST_ISSUED:
607 /* inst_issued1 + inst_issued2 * 2 */
608 return res64[0] + res64[1] * 2;
609 case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
610 return sm20_hw_metric_calc_result(hq, res64);
611 case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
612 /* (metric-inst_issued - inst_executed) / inst_executed */
613 if (res64[2])
614 return (((res64[0] + res64[1] * 2) - res64[2]) / (double)res64[2]);
615 break;
616 case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
617 /* metric-inst_issued / active_cycles */
618 if (res64[2])
619 return (res64[0] + res64[1] * 2) / (double)res64[2];
620 break;
621 case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS:
622 /* inst_issued1 + inst_issued2 */
623 return res64[0] + res64[1];
624 case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
625 /* ((metric-issue_slots / 2) / active_cycles) * 100 */
626 if (res64[2])
627 return (((res64[0] + res64[1]) / 2) / (double)res64[2]) * 100;
628 break;
629 case NVC0_HW_METRIC_QUERY_IPC:
630 return sm20_hw_metric_calc_result(hq, res64);
631 case NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD:
632 /* (shared_load_replay + shared_store_replay) / inst_executed */
633 if (res64[2])
634 return (res64[0] + res64[1]) / (double)res64[2];
635 break;
636 case NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY:
637 /* thread_inst_executed / (inst_executed * max. number of threads per
638 * wrap) * 100 */
639 if (res64[0])
640 return (res64[1] / ((double)res64[0] * 32)) * 100;
641 break;
642 default:
643 debug_printf("invalid metric type: %d\n",
644 hq->base.type - NVC0_HW_METRIC_QUERY(0));
645 break;
646 }
647 return 0;
648 }
649
650 static uint64_t
sm35_hw_metric_calc_result(struct nvc0_hw_query * hq,uint64_t res64[8])651 sm35_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
652 {
653 switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
654 case NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY:
655 /* not_predicated_off_thread_inst_executed / (inst_executed * max. number
656 * of threads per wrap) * 100 */
657 if (res64[0])
658 return (res64[1] / ((double)res64[0] * 32)) * 100;
659 break;
660 default:
661 return sm30_hw_metric_calc_result(hq, res64);
662 }
663 return 0;
664 }
665
666 static bool
nvc0_hw_metric_get_query_result(struct nvc0_context * nvc0,struct nvc0_hw_query * hq,bool wait,union pipe_query_result * result)667 nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
668 struct nvc0_hw_query *hq, bool wait,
669 union pipe_query_result *result)
670 {
671 struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
672 struct nvc0_screen *screen = nvc0->screen;
673 struct nouveau_device *dev = screen->base.device;
674 union pipe_query_result results[8] = {};
675 uint64_t res64[8] = {};
676 uint64_t value = 0;
677 bool ret = false;
678 unsigned i;
679
680 for (i = 0; i < hmq->num_queries; i++) {
681 ret = hmq->queries[i]->funcs->get_query_result(nvc0, hmq->queries[i],
682 wait, &results[i]);
683 if (!ret)
684 return ret;
685 res64[i] = *(uint64_t *)&results[i];
686 }
687
688 switch (screen->base.class_3d) {
689 case GM200_3D_CLASS:
690 case GM107_3D_CLASS:
691 case NVF0_3D_CLASS:
692 value = sm35_hw_metric_calc_result(hq, res64);
693 break;
694 case NVE4_3D_CLASS:
695 value = sm30_hw_metric_calc_result(hq, res64);
696 break;
697 default:
698 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
699 value = sm20_hw_metric_calc_result(hq, res64);
700 else
701 value = sm21_hw_metric_calc_result(hq, res64);
702 break;
703 }
704
705 *(uint64_t *)result = value;
706 return ret;
707 }
708
709 static const struct nvc0_hw_query_funcs hw_metric_query_funcs = {
710 .destroy_query = nvc0_hw_metric_destroy_query,
711 .begin_query = nvc0_hw_metric_begin_query,
712 .end_query = nvc0_hw_metric_end_query,
713 .get_query_result = nvc0_hw_metric_get_query_result,
714 };
715
716 struct nvc0_hw_query *
nvc0_hw_metric_create_query(struct nvc0_context * nvc0,unsigned type)717 nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type)
718 {
719 const struct nvc0_hw_metric_query_cfg *cfg;
720 struct nvc0_hw_metric_query *hmq;
721 struct nvc0_hw_query *hq;
722 unsigned i;
723
724 if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST)
725 return NULL;
726
727 hmq = CALLOC_STRUCT(nvc0_hw_metric_query);
728 if (!hmq)
729 return NULL;
730
731 hq = &hmq->base;
732 hq->funcs = &hw_metric_query_funcs;
733 hq->base.type = type;
734
735 cfg = nvc0_hw_metric_query_get_cfg(nvc0, hq);
736
737 for (i = 0; i < cfg->num_queries; i++) {
738 hmq->queries[i] = nvc0_hw_sm_create_query(nvc0, cfg->queries[i]);
739 if (!hmq->queries[i]) {
740 nvc0_hw_metric_destroy_query(nvc0, hq);
741 return NULL;
742 }
743 hmq->num_queries++;
744 }
745
746 return hq;
747 }
748
749 int
nvc0_hw_metric_get_driver_query_info(struct nvc0_screen * screen,unsigned id,struct pipe_driver_query_info * info)750 nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
751 struct pipe_driver_query_info *info)
752 {
753 int count = 0;
754
755 if (screen->base.drm->version >= 0x01000101) {
756 if (screen->compute)
757 count = nvc0_hw_metric_get_num_queries(screen);
758 }
759
760 if (!info)
761 return count;
762
763 if (id < count) {
764 if (screen->compute) {
765 if (screen->base.class_3d <= GM200_3D_CLASS) {
766 const struct nvc0_hw_metric_query_cfg **queries =
767 nvc0_hw_metric_get_queries(screen);
768 const struct nvc0_hw_metric_cfg *cfg =
769 nvc0_hw_metric_get_cfg(queries[id]->type);
770
771 info->name = cfg->name;
772 info->query_type = NVC0_HW_METRIC_QUERY(queries[id]->type);
773 info->type = cfg->type;
774 info->group_id = NVC0_HW_METRIC_QUERY_GROUP;
775 return 1;
776 }
777 }
778 }
779 return 0;
780 }
781