• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2015 Samuel Pitoiset
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "nvc0/nvc0_context.h"
24 #include "nvc0/nvc0_query_hw_metric.h"
25 #include "nvc0/nvc0_query_hw_sm.h"
26 
27 #define _Q(i,n,t,d) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t, d }
28 static const struct nvc0_hw_metric_cfg {
29    unsigned id;
30    const char *name;
31    enum pipe_driver_query_type type;
32    const char *desc;
33 } nvc0_hw_metric_queries[] = {
34    _Q(ACHIEVED_OCCUPANCY,
35       "metric-achieved_occupancy",
36       PERCENTAGE,
37       "Ratio of the average active warps per active cycle to the maximum "
38       "number of warps supported on a multiprocessor"),
39 
40    _Q(BRANCH_EFFICIENCY,
41       "metric-branch_efficiency",
42       PERCENTAGE,
43       "Ratio of non-divergent branches to total branches"),
44 
45    _Q(INST_ISSUED,
46       "metric-inst_issued",
47       UINT64,
48       "The number of instructions issued"),
49 
50    _Q(INST_PER_WRAP,
51       "metric-inst_per_wrap",
52       UINT64,
53       "Average number of instructions executed by each warp"),
54 
55    _Q(INST_REPLAY_OVERHEAD,
56       "metric-inst_replay_overhead",
57       UINT64,
58       "Average number of replays for each instruction executed"),
59 
60    _Q(ISSUED_IPC,
61       "metric-issued_ipc",
62       UINT64,
63       "Instructions issued per cycle"),
64 
65    _Q(ISSUE_SLOTS,
66       "metric-issue_slots",
67       UINT64,
68       "The number of issue slots used"),
69 
70    _Q(ISSUE_SLOT_UTILIZATION,
71       "metric-issue_slot_utilization",
72       PERCENTAGE,
73       "Percentage of issue slots that issued at least one instruction, "
74       "averaged across all cycles"),
75 
76    _Q(IPC,
77       "metric-ipc",
78       UINT64,
79       "Instructions executed per cycle"),
80 
81    _Q(SHARED_REPLAY_OVERHEAD,
82       "metric-shared_replay_overhead",
83       UINT64,
84       "Average number of replays due to shared memory conflicts for each "
85       "instruction executed"),
86 
87    _Q(WARP_EXECUTION_EFFICIENCY,
88       "metric-warp_execution_efficiency",
89       PERCENTAGE,
90       "Ratio of the average active threads per warp to the maximum number of "
91       "threads per warp supported on a multiprocessor"),
92 
93    _Q(WARP_NONPRED_EXECUTION_EFFICIENCY,
94       "metric-warp_nonpred_execution_efficiency",
95       PERCENTAGE,
96       "Ratio of the average active threads per warp executing non-predicated "
97       "instructions to the maximum number of threads per warp supported on a "
98       "multiprocessor"),
99 };
100 
101 #undef _Q
102 
103 static inline const struct nvc0_hw_metric_cfg *
nvc0_hw_metric_get_cfg(unsigned metric_id)104 nvc0_hw_metric_get_cfg(unsigned metric_id)
105 {
106    unsigned i;
107 
108    for (i = 0; i < ARRAY_SIZE(nvc0_hw_metric_queries); i++) {
109       if (nvc0_hw_metric_queries[i].id == metric_id)
110          return &nvc0_hw_metric_queries[i];
111    }
112    assert(0);
113    return NULL;
114 }
115 
/*
 * Recipe for one metric on one family of chips: which hardware SM counter
 * queries have to be created so that the metric can be computed from their
 * results.
 */
struct nvc0_hw_metric_query_cfg {
   /* NVC0_HW_METRIC_QUERY_* id of the metric this recipe computes. */
   unsigned type;
   /* SM counter query types; the position of a counter here is its index in
    * the res64[] array handed to the *_hw_metric_calc_result() functions. */
   uint32_t queries[8];
   /* Number of valid entries in queries[]. */
   uint32_t num_queries;
};
121 
122 #define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n)
123 
124 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
125 static const struct nvc0_hw_metric_query_cfg
126 sm20_achieved_occupancy =
127 {
128    .type        = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY,
129    .queries[0]  = _SM(ACTIVE_WARPS),
130    .queries[1]  = _SM(ACTIVE_CYCLES),
131    .num_queries = 2,
132 };
133 
134 static const struct nvc0_hw_metric_query_cfg
135 sm20_branch_efficiency =
136 {
137    .type        = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY,
138    .queries[0]  = _SM(BRANCH),
139    .queries[1]  = _SM(DIVERGENT_BRANCH),
140    .num_queries = 2,
141 };
142 
143 static const struct nvc0_hw_metric_query_cfg
144 sm20_inst_per_wrap =
145 {
146    .type        = NVC0_HW_METRIC_QUERY_INST_PER_WRAP,
147    .queries[0]  = _SM(INST_EXECUTED),
148    .queries[1]  = _SM(WARPS_LAUNCHED),
149    .num_queries = 2,
150 };
151 
152 static const struct nvc0_hw_metric_query_cfg
153 sm20_inst_replay_overhead =
154 {
155    .type        = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
156    .queries[0]  = _SM(INST_ISSUED),
157    .queries[1]  = _SM(INST_EXECUTED),
158    .num_queries = 2,
159 };
160 
161 static const struct nvc0_hw_metric_query_cfg
162 sm20_issued_ipc =
163 {
164    .type        = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
165    .queries[0]  = _SM(INST_ISSUED),
166    .queries[1]  = _SM(ACTIVE_CYCLES),
167    .num_queries = 2,
168 };
169 
170 static const struct nvc0_hw_metric_query_cfg
171 sm20_issue_slot_utilization =
172 {
173    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
174    .queries[0]  = _SM(INST_ISSUED),
175    .queries[1]  = _SM(ACTIVE_CYCLES),
176    .num_queries = 2,
177 };
178 
179 static const struct nvc0_hw_metric_query_cfg
180 sm20_ipc =
181 {
182    .type        = NVC0_HW_METRIC_QUERY_IPC,
183    .queries[0]  = _SM(INST_EXECUTED),
184    .queries[1]  = _SM(ACTIVE_CYCLES),
185    .num_queries = 2,
186 };
187 
188 static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] =
189 {
190    &sm20_achieved_occupancy,
191    &sm20_branch_efficiency,
192    &sm20_inst_per_wrap,
193    &sm20_inst_replay_overhead,
194    &sm20_ipc,
195    &sm20_issued_ipc,
196    &sm20_issue_slot_utilization,
197 };
198 
199 /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
200 static const struct nvc0_hw_metric_query_cfg
201 sm21_inst_issued =
202 {
203    .type        = NVC0_HW_METRIC_QUERY_INST_ISSUED,
204    .queries[0]  = _SM(INST_ISSUED1_0),
205    .queries[1]  = _SM(INST_ISSUED1_1),
206    .queries[2]  = _SM(INST_ISSUED2_0),
207    .queries[3]  = _SM(INST_ISSUED2_1),
208    .num_queries = 4,
209 };
210 
211 static const struct nvc0_hw_metric_query_cfg
212 sm21_inst_replay_overhead =
213 {
214    .type        = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
215    .queries[0]  = _SM(INST_ISSUED1_0),
216    .queries[1]  = _SM(INST_ISSUED1_1),
217    .queries[2]  = _SM(INST_ISSUED2_0),
218    .queries[3]  = _SM(INST_ISSUED2_1),
219    .queries[4]  = _SM(INST_EXECUTED),
220    .num_queries = 5,
221 };
222 
223 static const struct nvc0_hw_metric_query_cfg
224 sm21_issued_ipc =
225 {
226    .type        = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
227    .queries[0]  = _SM(INST_ISSUED1_0),
228    .queries[1]  = _SM(INST_ISSUED1_1),
229    .queries[2]  = _SM(INST_ISSUED2_0),
230    .queries[3]  = _SM(INST_ISSUED2_1),
231    .queries[4]  = _SM(ACTIVE_CYCLES),
232    .num_queries = 5,
233 };
234 
235 static const struct nvc0_hw_metric_query_cfg
236 sm21_issue_slots =
237 {
238    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
239    .queries[0]  = _SM(INST_ISSUED1_0),
240    .queries[1]  = _SM(INST_ISSUED1_1),
241    .queries[2]  = _SM(INST_ISSUED2_0),
242    .queries[3]  = _SM(INST_ISSUED2_1),
243    .num_queries = 4,
244 };
245 
246 static const struct nvc0_hw_metric_query_cfg
247 sm21_issue_slot_utilization =
248 {
249    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
250    .queries[0]  = _SM(INST_ISSUED1_0),
251    .queries[1]  = _SM(INST_ISSUED1_1),
252    .queries[2]  = _SM(INST_ISSUED2_0),
253    .queries[3]  = _SM(INST_ISSUED2_1),
254    .queries[4]  = _SM(ACTIVE_CYCLES),
255    .num_queries = 5,
256 };
257 
258 static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] =
259 {
260    &sm20_achieved_occupancy,
261    &sm20_branch_efficiency,
262    &sm21_inst_issued,
263    &sm20_inst_per_wrap,
264    &sm21_inst_replay_overhead,
265    &sm20_ipc,
266    &sm21_issued_ipc,
267    &sm21_issue_slots,
268    &sm21_issue_slot_utilization,
269 };
270 
271 /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */
272 static const struct nvc0_hw_metric_query_cfg
273 sm30_inst_issued =
274 {
275    .type        = NVC0_HW_METRIC_QUERY_INST_ISSUED,
276    .queries[0]  = _SM(INST_ISSUED1),
277    .queries[1]  = _SM(INST_ISSUED2),
278    .num_queries = 2,
279 };
280 
281 static const struct nvc0_hw_metric_query_cfg
282 sm30_inst_replay_overhead =
283 {
284    .type        = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
285    .queries[0]  = _SM(INST_ISSUED1),
286    .queries[1]  = _SM(INST_ISSUED2),
287    .queries[2]  = _SM(INST_EXECUTED),
288    .num_queries = 3,
289 };
290 
291 static const struct nvc0_hw_metric_query_cfg
292 sm30_issued_ipc =
293 {
294    .type        = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
295    .queries[0]  = _SM(INST_ISSUED1),
296    .queries[1]  = _SM(INST_ISSUED2),
297    .queries[2]  = _SM(ACTIVE_CYCLES),
298    .num_queries = 3,
299 };
300 
301 static const struct nvc0_hw_metric_query_cfg
302 sm30_issue_slots =
303 {
304    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
305    .queries[0]  = _SM(INST_ISSUED1),
306    .queries[1]  = _SM(INST_ISSUED2),
307    .num_queries = 2,
308 };
309 
310 static const struct nvc0_hw_metric_query_cfg
311 sm30_issue_slot_utilization =
312 {
313    .type        = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
314    .queries[0]  = _SM(INST_ISSUED1),
315    .queries[1]  = _SM(INST_ISSUED2),
316    .queries[2]  = _SM(ACTIVE_CYCLES),
317    .num_queries = 3,
318 };
319 
320 static const struct nvc0_hw_metric_query_cfg
321 sm30_shared_replay_overhead =
322 {
323    .type        = NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD,
324    .queries[0]  = _SM(SHARED_LD_REPLAY),
325    .queries[1]  = _SM(SHARED_ST_REPLAY),
326    .queries[2]  = _SM(INST_EXECUTED),
327    .num_queries = 3,
328 };
329 
330 static const struct nvc0_hw_metric_query_cfg
331 sm30_warp_execution_efficiency =
332 {
333    .type        = NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY,
334    .queries[0]  = _SM(INST_EXECUTED),
335    .queries[1]  = _SM(TH_INST_EXECUTED),
336    .num_queries = 2,
337 };
338 
339 static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] =
340 {
341    &sm20_achieved_occupancy,
342    &sm20_branch_efficiency,
343    &sm30_inst_issued,
344    &sm20_inst_per_wrap,
345    &sm30_inst_replay_overhead,
346    &sm20_ipc,
347    &sm30_issued_ipc,
348    &sm30_issue_slots,
349    &sm30_issue_slot_utilization,
350    &sm30_shared_replay_overhead,
351    &sm30_warp_execution_efficiency,
352 };
353 
354 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
355 static const struct nvc0_hw_metric_query_cfg
356 sm35_warp_nonpred_execution_efficiency =
357 {
358    .type        = NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY,
359    .queries[0]  = _SM(INST_EXECUTED),
360    .queries[1]  = _SM(NOT_PRED_OFF_INST_EXECUTED),
361    .num_queries = 2,
362 };
363 
364 static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] =
365 {
366    &sm20_achieved_occupancy,
367    &sm30_inst_issued,
368    &sm20_inst_per_wrap,
369    &sm30_inst_replay_overhead,
370    &sm20_ipc,
371    &sm30_issued_ipc,
372    &sm30_issue_slots,
373    &sm30_issue_slot_utilization,
374    &sm30_shared_replay_overhead,
375    &sm30_warp_execution_efficiency,
376    &sm35_warp_nonpred_execution_efficiency,
377 };
378 
/* ==== Compute capability 5.0 (GM107/GM108) ==== */
/*
 * No SM 5.0 specific recipes exist: the list is assembled entirely from the
 * SM 2.0, SM 3.0 and SM 3.5 configurations. Compared to the SM 3.0 list it
 * drops shared_replay_overhead and adds warp_nonpred_execution_efficiency.
 * The order of the entries defines the query index reported to callers of
 * nvc0_hw_metric_get_driver_query_info().
 */
static const struct nvc0_hw_metric_query_cfg *sm50_hw_metric_queries[] =
{
   &sm20_achieved_occupancy,
   &sm20_branch_efficiency,
   &sm30_inst_issued,
   &sm20_inst_per_wrap,
   &sm30_inst_replay_overhead,
   &sm20_ipc,
   &sm30_issued_ipc,
   &sm30_issue_slots,
   &sm30_issue_slot_utilization,
   &sm30_warp_execution_efficiency,
   &sm35_warp_nonpred_execution_efficiency,
};

/* The SM counter shorthand is only needed by the tables above. */
#undef _SM
396 
397 static inline const struct nvc0_hw_metric_query_cfg **
nvc0_hw_metric_get_queries(struct nvc0_screen * screen)398 nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
399 {
400    struct nouveau_device *dev = screen->base.device;
401 
402    switch (screen->base.class_3d) {
403    case GM200_3D_CLASS:
404    case GM107_3D_CLASS:
405       return sm50_hw_metric_queries;
406    case NVF0_3D_CLASS:
407       return sm35_hw_metric_queries;
408    case NVE4_3D_CLASS:
409       return sm30_hw_metric_queries;
410    case NVC0_3D_CLASS:
411    case NVC1_3D_CLASS:
412    case NVC8_3D_CLASS:
413       if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
414          return sm20_hw_metric_queries;
415       return sm21_hw_metric_queries;
416    }
417    assert(0);
418    return NULL;
419 }
420 
421 unsigned
nvc0_hw_metric_get_num_queries(struct nvc0_screen * screen)422 nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen)
423 {
424    struct nouveau_device *dev = screen->base.device;
425 
426    switch (screen->base.class_3d) {
427    case GM200_3D_CLASS:
428    case GM107_3D_CLASS:
429       return ARRAY_SIZE(sm50_hw_metric_queries);
430    case NVF0_3D_CLASS:
431       return ARRAY_SIZE(sm35_hw_metric_queries);
432    case NVE4_3D_CLASS:
433       return ARRAY_SIZE(sm30_hw_metric_queries);
434    case NVC0_3D_CLASS:
435    case NVC1_3D_CLASS:
436    case NVC8_3D_CLASS:
437       if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
438          return ARRAY_SIZE(sm20_hw_metric_queries);
439       return ARRAY_SIZE(sm21_hw_metric_queries);
440    }
441    return 0;
442 }
443 
444 static const struct nvc0_hw_metric_query_cfg *
nvc0_hw_metric_query_get_cfg(struct nvc0_context * nvc0,struct nvc0_hw_query * hq)445 nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
446 {
447    const struct nvc0_hw_metric_query_cfg **queries;
448    struct nvc0_screen *screen = nvc0->screen;
449    struct nvc0_query *q = &hq->base;
450    unsigned num_queries;
451    unsigned i;
452 
453    num_queries = nvc0_hw_metric_get_num_queries(screen);
454    queries = nvc0_hw_metric_get_queries(screen);
455 
456    for (i = 0; i < num_queries; i++) {
457       if (NVC0_HW_METRIC_QUERY(queries[i]->type) == q->type)
458          return queries[i];
459    }
460    assert(0);
461    return NULL;
462 }
463 
464 static void
nvc0_hw_metric_destroy_query(struct nvc0_context * nvc0,struct nvc0_hw_query * hq)465 nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
466                              struct nvc0_hw_query *hq)
467 {
468    struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
469    unsigned i;
470 
471    for (i = 0; i < hmq->num_queries; i++)
472       if (hmq->queries[i]->funcs->destroy_query)
473          hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
474    FREE(hmq);
475 }
476 
477 static bool
nvc0_hw_metric_begin_query(struct nvc0_context * nvc0,struct nvc0_hw_query * hq)478 nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
479 {
480    struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
481    bool ret = false;
482    unsigned i;
483 
484    for (i = 0; i < hmq->num_queries; i++) {
485       ret = hmq->queries[i]->funcs->begin_query(nvc0, hmq->queries[i]);
486       if (!ret)
487          return ret;
488    }
489    return ret;
490 }
491 
492 static void
nvc0_hw_metric_end_query(struct nvc0_context * nvc0,struct nvc0_hw_query * hq)493 nvc0_hw_metric_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
494 {
495    struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
496    unsigned i;
497 
498    for (i = 0; i < hmq->num_queries; i++)
499       hmq->queries[i]->funcs->end_query(nvc0, hmq->queries[i]);
500 }
501 
502 static uint64_t
sm20_hw_metric_calc_result(struct nvc0_hw_query * hq,uint64_t res64[8])503 sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
504 {
505    switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
506    case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
507       /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
508       if (res64[1])
509          return ((res64[0] / (double)res64[1]) / 48) * 100;
510       break;
511    case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
512       /* (branch / (branch + divergent_branch)) * 100 */
513       if (res64[0] + res64[1])
514          return (res64[0] / (double)(res64[0] + res64[1])) * 100;
515       break;
516    case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
517       /* inst_executed / warps_launched */
518       if (res64[1])
519          return res64[0] / (double)res64[1];
520       break;
521    case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
522       /* (inst_issued - inst_executed) / inst_executed */
523       if (res64[1])
524          return (res64[0] - res64[1]) / (double)res64[1];
525       break;
526    case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
527       /* inst_issued / active_cycles */
528       if (res64[1])
529          return res64[0] / (double)res64[1];
530       break;
531    case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
532       /* ((inst_issued / 2) / active_cycles) * 100 */
533       if (res64[1])
534          return ((res64[0] / 2) / (double)res64[1]) * 100;
535       break;
536    case NVC0_HW_METRIC_QUERY_IPC:
537       /* inst_executed / active_cycles */
538       if (res64[1])
539          return res64[0] / (double)res64[1];
540       break;
541    default:
542       debug_printf("invalid metric type: %d\n",
543                    hq->base.type - NVC0_HW_METRIC_QUERY(0));
544       break;
545    }
546    return 0;
547 }
548 
549 static uint64_t
sm21_hw_metric_calc_result(struct nvc0_hw_query * hq,uint64_t res64[8])550 sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
551 {
552    switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
553    case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
554       return sm20_hw_metric_calc_result(hq, res64);
555    case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
556       return sm20_hw_metric_calc_result(hq, res64);
557    case NVC0_HW_METRIC_QUERY_INST_ISSUED:
558       /* issued1_0 + issued1_1 + (issued2_0 + issued2_1) * 2 */
559       return res64[0] + res64[1] + (res64[2] + res64[3]) * 2;
560       break;
561    case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
562       return sm20_hw_metric_calc_result(hq, res64);
563    case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
564       /* (metric-inst_issued - inst_executed) / inst_executed */
565       if (res64[4])
566          return (((res64[0] + res64[1] + (res64[2] + res64[3]) * 2) -
567                    res64[4]) / (double)res64[4]);
568       break;
569    case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
570       /* metric-inst_issued / active_cycles */
571       if (res64[4])
572          return (res64[0] + res64[1] + (res64[2] + res64[3]) * 2) /
573                 (double)res64[4];
574       break;
575    case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS:
576       /* issued1_0 + issued1_1 + issued2_0 + issued2_1 */
577       return res64[0] + res64[1] + res64[2] + res64[3];
578       break;
579    case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
580       /* ((metric-issue_slots / 2) / active_cycles) * 100 */
581       if (res64[4])
582          return (((res64[0] + res64[1] + res64[2] + res64[3]) / 2) /
583                  (double)res64[4]) * 100;
584       break;
585    case NVC0_HW_METRIC_QUERY_IPC:
586       return sm20_hw_metric_calc_result(hq, res64);
587    default:
588       debug_printf("invalid metric type: %d\n",
589                    hq->base.type - NVC0_HW_METRIC_QUERY(0));
590       break;
591    }
592    return 0;
593 }
594 
595 static uint64_t
sm30_hw_metric_calc_result(struct nvc0_hw_query * hq,uint64_t res64[8])596 sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
597 {
598    switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
599    case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
600       /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
601       if (res64[1])
602          return ((res64[0] / (double)res64[1]) / 64) * 100;
603       break;
604    case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
605       return sm20_hw_metric_calc_result(hq, res64);
606    case NVC0_HW_METRIC_QUERY_INST_ISSUED:
607       /* inst_issued1 + inst_issued2 * 2 */
608       return res64[0] + res64[1] * 2;
609    case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
610       return sm20_hw_metric_calc_result(hq, res64);
611    case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
612       /* (metric-inst_issued - inst_executed) / inst_executed */
613       if (res64[2])
614          return (((res64[0] + res64[1] * 2) - res64[2]) / (double)res64[2]);
615       break;
616    case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
617       /* metric-inst_issued / active_cycles */
618       if (res64[2])
619          return (res64[0] + res64[1] * 2) / (double)res64[2];
620       break;
621    case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS:
622       /* inst_issued1 + inst_issued2 */
623       return res64[0] + res64[1];
624    case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
625       /* ((metric-issue_slots / 2) / active_cycles) * 100 */
626       if (res64[2])
627          return (((res64[0] + res64[1]) / 2) / (double)res64[2]) * 100;
628       break;
629    case NVC0_HW_METRIC_QUERY_IPC:
630       return sm20_hw_metric_calc_result(hq, res64);
631    case NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD:
632       /* (shared_load_replay + shared_store_replay) / inst_executed */
633       if (res64[2])
634          return (res64[0] + res64[1]) / (double)res64[2];
635       break;
636    case NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY:
637       /* thread_inst_executed / (inst_executed * max. number of threads per
638        * wrap) * 100 */
639       if (res64[0])
640          return (res64[1] / ((double)res64[0] * 32)) * 100;
641       break;
642    default:
643       debug_printf("invalid metric type: %d\n",
644                    hq->base.type - NVC0_HW_METRIC_QUERY(0));
645       break;
646    }
647    return 0;
648 }
649 
650 static uint64_t
sm35_hw_metric_calc_result(struct nvc0_hw_query * hq,uint64_t res64[8])651 sm35_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
652 {
653    switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
654    case NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY:
655       /* not_predicated_off_thread_inst_executed / (inst_executed * max. number
656        * of threads per wrap) * 100 */
657       if (res64[0])
658          return (res64[1] / ((double)res64[0] * 32)) * 100;
659       break;
660    default:
661       return sm30_hw_metric_calc_result(hq, res64);
662    }
663    return 0;
664 }
665 
666 static bool
nvc0_hw_metric_get_query_result(struct nvc0_context * nvc0,struct nvc0_hw_query * hq,bool wait,union pipe_query_result * result)667 nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
668                                 struct nvc0_hw_query *hq, bool wait,
669                                 union pipe_query_result *result)
670 {
671    struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
672    struct nvc0_screen *screen = nvc0->screen;
673    struct nouveau_device *dev = screen->base.device;
674    union pipe_query_result results[8] = {};
675    uint64_t res64[8] = {};
676    uint64_t value = 0;
677    bool ret = false;
678    unsigned i;
679 
680    for (i = 0; i < hmq->num_queries; i++) {
681       ret = hmq->queries[i]->funcs->get_query_result(nvc0, hmq->queries[i],
682                                                      wait, &results[i]);
683       if (!ret)
684          return ret;
685       res64[i] = *(uint64_t *)&results[i];
686    }
687 
688    switch (screen->base.class_3d) {
689    case GM200_3D_CLASS:
690    case GM107_3D_CLASS:
691    case NVF0_3D_CLASS:
692       value = sm35_hw_metric_calc_result(hq, res64);
693       break;
694    case NVE4_3D_CLASS:
695       value = sm30_hw_metric_calc_result(hq, res64);
696       break;
697    default:
698       if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
699          value = sm20_hw_metric_calc_result(hq, res64);
700       else
701          value = sm21_hw_metric_calc_result(hq, res64);
702       break;
703    }
704 
705    *(uint64_t *)result = value;
706    return ret;
707 }
708 
709 static const struct nvc0_hw_query_funcs hw_metric_query_funcs = {
710    .destroy_query = nvc0_hw_metric_destroy_query,
711    .begin_query = nvc0_hw_metric_begin_query,
712    .end_query = nvc0_hw_metric_end_query,
713    .get_query_result = nvc0_hw_metric_get_query_result,
714 };
715 
716 struct nvc0_hw_query *
nvc0_hw_metric_create_query(struct nvc0_context * nvc0,unsigned type)717 nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type)
718 {
719    const struct nvc0_hw_metric_query_cfg *cfg;
720    struct nvc0_hw_metric_query *hmq;
721    struct nvc0_hw_query *hq;
722    unsigned i;
723 
724    if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST)
725       return NULL;
726 
727    hmq = CALLOC_STRUCT(nvc0_hw_metric_query);
728    if (!hmq)
729       return NULL;
730 
731    hq = &hmq->base;
732    hq->funcs = &hw_metric_query_funcs;
733    hq->base.type = type;
734 
735    cfg = nvc0_hw_metric_query_get_cfg(nvc0, hq);
736 
737    for (i = 0; i < cfg->num_queries; i++) {
738       hmq->queries[i] = nvc0_hw_sm_create_query(nvc0, cfg->queries[i]);
739       if (!hmq->queries[i]) {
740          nvc0_hw_metric_destroy_query(nvc0, hq);
741          return NULL;
742       }
743       hmq->num_queries++;
744    }
745 
746    return hq;
747 }
748 
749 int
nvc0_hw_metric_get_driver_query_info(struct nvc0_screen * screen,unsigned id,struct pipe_driver_query_info * info)750 nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
751                                      struct pipe_driver_query_info *info)
752 {
753    int count = 0;
754 
755    if (screen->base.drm->version >= 0x01000101) {
756       if (screen->compute)
757          count = nvc0_hw_metric_get_num_queries(screen);
758    }
759 
760    if (!info)
761       return count;
762 
763    if (id < count) {
764       if (screen->compute) {
765          if (screen->base.class_3d <= GM200_3D_CLASS) {
766             const struct nvc0_hw_metric_query_cfg **queries =
767                nvc0_hw_metric_get_queries(screen);
768             const struct nvc0_hw_metric_cfg *cfg =
769                nvc0_hw_metric_get_cfg(queries[id]->type);
770 
771             info->name = cfg->name;
772             info->query_type = NVC0_HW_METRIC_QUERY(queries[id]->type);
773             info->type = cfg->type;
774             info->group_id = NVC0_HW_METRIC_QUERY_GROUP;
775             return 1;
776          }
777       }
778    }
779    return 0;
780 }
781