1 /*
2 * Copyright 2011 Christoph Bumiller
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
25
26 #include "nvc0/nvc0_context.h"
27 #include "nvc0/nvc0_query_hw_sm.h"
28
29 #include "nv_object.xml.h"
30 #include "nvc0/nve4_compute.xml.h"
31 #include "nvc0/nvc0_compute.xml.h"
32
33 /* NOTE: intentionally using the same names as NV */
34 #define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d }
35 static const struct {
36 unsigned type;
37 const char *name;
38 const char *desc;
39 } nvc0_hw_sm_queries[] = {
40 _Q(ACTIVE_CTAS,
41 "active_ctas",
42 "Accumulated number of active blocks per cycle. For every cycle it "
43 "increments by the number of active blocks in the cycle which can be in "
44 "the range 0 to 32."),
45
46 _Q(ACTIVE_CYCLES,
47 "active_cycles",
48 "Number of cycles a multiprocessor has at least one active warp"),
49
50 _Q(ACTIVE_WARPS,
51 "active_warps",
52 "Accumulated number of active warps per cycle. For every cycle it "
53 "increments by the number of active warps in the cycle which can be in "
54 "the range 0 to 64"),
55
56 _Q(ATOM_CAS_COUNT,
57 "atom_cas_count",
58 "Number of warps executing atomic compare and swap operations. Increments "
59 "by one if at least one thread in a warp executes the instruction."),
60
61 _Q(ATOM_COUNT,
62 "atom_count",
63 "Number of warps executing atomic reduction operations. Increments by one "
64 "if at least one thread in a warp executes the instruction"),
65
66 _Q(BRANCH,
67 "branch",
68 "Number of branch instructions executed per warp on a multiprocessor"),
69
70 _Q(DIVERGENT_BRANCH,
71 "divergent_branch",
72 "Number of divergent branches within a warp. This counter will be "
73 "incremented by one if at least one thread in a warp diverges (that is, "
74 "follows a different execution path) via a conditional branch"),
75
76 _Q(GLD_REQUEST,
77 "gld_request",
78 "Number of executed load instructions where the state space is not "
79 "specified and hence generic addressing is used, increments per warp on a "
80 "multiprocessor. It can include the load operations from global,local and "
81 "shared state space"),
82
83 _Q(GLD_MEM_DIV_REPLAY,
84 "global_ld_mem_divergence_replays",
85 "Number of instruction replays for global memory loads. Instruction is "
86 "replayed if the instruction is accessing more than one cache line of "
87 "128 bytes. For each extra cache line access the counter is incremented "
88 "by 1"),
89
90 _Q(GLOBAL_ATOM_CAS,
91 "global_atom_cas",
92 "Number of ATOM.CAS instructions executed per warp."),
93
94 _Q(GLOBAL_LD,
95 "global_load",
96 "Number of executed load instructions where state space is specified as "
97 "global, increments per warp on a multiprocessor."),
98
99 _Q(GLOBAL_ST,
100 "global_store",
101 "Number of executed store instructions where state space is specified as "
102 "global, increments per warp on a multiprocessor."),
103
104 _Q(GST_TRANSACTIONS,
105 "global_store_transaction",
106 "Number of global store transactions. Increments by 1 per transaction. "
107 "Transaction can be 32/64/96/128B"),
108
109 _Q(GST_MEM_DIV_REPLAY,
110 "global_st_mem_divergence_replays",
111 "Number of instruction replays for global memory stores. Instruction is "
112 "replayed if the instruction is accessing more than one cache line of "
113 "128 bytes. For each extra cache line access the counter is incremented "
114 "by 1"),
115
116 _Q(GRED_COUNT,
117 "gred_count",
118 "Number of warps executing reduction operations on global memory. "
119 "Increments by one if at least one thread in a warp executes the "
120 "instruction"),
121
122 _Q(GST_REQUEST,
123 "gst_request",
124 "Number of executed store instructions where the state space is not "
125 "specified and hence generic addressing is used, increments per warp on a "
126 "multiprocessor. It can include the store operations to global,local and "
127 "shared state space"),
128
129 _Q(INST_EXECUTED,
130 "inst_executed",
131 "Number of instructions executed, do not include replays"),
132
133 _Q(INST_ISSUED,
134 "inst_issued",
135 "Number of instructions issued including replays"),
136
137 _Q(INST_ISSUED0,
138 "inst_issued0",
139 "Number of cycles that did not issue any instruction, increments per "
140 "warp."),
141
142 _Q(INST_ISSUED1,
143 "inst_issued1",
144 "Number of single instruction issued per cycle"),
145
146 _Q(INST_ISSUED2,
147 "inst_issued2",
148 "Number of dual instructions issued per cycle"),
149
150 _Q(INST_ISSUED1_0,
151 "inst_issued1_0",
152 "Number of single instruction issued per cycle in pipeline 0"),
153
154 _Q(INST_ISSUED1_1,
155 "inst_issued1_1",
156 "Number of single instruction issued per cycle in pipeline 1"),
157
158 _Q(INST_ISSUED2_0,
159 "inst_issued2_0",
160 "Number of dual instructions issued per cycle in pipeline 0"),
161
162 _Q(INST_ISSUED2_1,
163 "inst_issued2_1",
164 "Number of dual instructions issued per cycle in pipeline 1"),
165
166 _Q(L1_GLD_HIT,
167 "l1_global_load_hit",
168 "Number of cache lines that hit in L1 cache for global memory load "
169 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
170 "32, 64 and 128 bit accesses by a warp respectively"),
171
172 _Q(L1_GLD_MISS,
173 "l1_global_load_miss",
174 "Number of cache lines that miss in L1 cache for global memory load "
175 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
176 "32, 64 and 128 bit accesses by a warp respectively"),
177
178 _Q(L1_GLD_TRANSACTIONS,
179 "__l1_global_load_transactions",
180 "Number of global load transactions from L1 cache. Increments by 1 per "
181 "transaction. Transaction can be 32/64/96/128B"),
182
183 _Q(L1_GST_TRANSACTIONS,
184 "__l1_global_store_transactions",
185 "Number of global store transactions from L1 cache. Increments by 1 per "
186 "transaction. Transaction can be 32/64/96/128B"),
187
188 _Q(L1_LOCAL_LD_HIT,
189 "l1_local_load_hit",
190 "Number of cache lines that hit in L1 cache for local memory load "
191 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
192 "32, 64 and 128 bit accesses by a warp respectively"),
193
194 _Q(L1_LOCAL_LD_MISS,
195 "l1_local_load_miss",
196 "Number of cache lines that miss in L1 cache for local memory load "
197 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
198 "32, 64 and 128 bit accesses by a warp respectively"),
199
200 _Q(L1_LOCAL_ST_HIT,
201 "l1_local_store_hit",
202 "Number of cache lines that hit in L1 cache for local memory store "
203 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
204 "32, 64 and 128 bit accesses by a warp respectively"),
205
206 _Q(L1_LOCAL_ST_MISS,
207 "l1_local_store_miss",
208 "Number of cache lines that miss in L1 cache for local memory store "
209 "accesses. In case of perfect coalescing this increments by 1,2, and 4 for "
210 "32,64 and 128 bit accesses by a warp respectively"),
211
212 _Q(L1_SHARED_LD_TRANSACTIONS,
213 "l1_shared_load_transactions",
214 "Number of shared load transactions. Increments by 1 per transaction. "
215 "Transaction can be 32/64/96/128B"),
216
217 _Q(L1_SHARED_ST_TRANSACTIONS,
218 "l1_shared_store_transactions",
219 "Number of shared store transactions. Increments by 1 per transaction. "
220 "Transaction can be 32/64/96/128B"),
221
222 _Q(LOCAL_LD,
223 "local_load",
224 "Number of executed load instructions where state space is specified as "
225 "local, increments per warp on a multiprocessor"),
226
227 _Q(LOCAL_LD_TRANSACTIONS,
228 "local_load_transactions",
229 "Number of local load transactions from L1 cache. Increments by 1 per "
230 "transaction. Transaction can be 32/64/96/128B"),
231
232 _Q(LOCAL_ST,
233 "local_store",
234 "Number of executed store instructions where state space is specified as "
235 "local, increments per warp on a multiprocessor"),
236
237 _Q(LOCAL_ST_TRANSACTIONS,
238 "local_store_transactions",
239 "Number of local store transactions to L1 cache. Increments by 1 per "
240 "transaction. Transaction can be 32/64/96/128B."),
241
242 _Q(NOT_PRED_OFF_INST_EXECUTED,
243 "not_predicated_off_thread_inst_executed",
244 "Number of not predicated off instructions executed by all threads, does "
245 "not include replays. For each instruction it increments by the number of "
246 "threads that execute this instruction"),
247
248 _Q(PROF_TRIGGER_0,
249 "prof_trigger_00",
250 "User profiled generic trigger that can be inserted in any place of the "
251 "code to collect the related information. Increments per warp."),
252
253 _Q(PROF_TRIGGER_1,
254 "prof_trigger_01",
255 "User profiled generic trigger that can be inserted in any place of the "
256 "code to collect the related information. Increments per warp."),
257
258 _Q(PROF_TRIGGER_2,
259 "prof_trigger_02",
260 "User profiled generic trigger that can be inserted in any place of the "
261 "code to collect the related information. Increments per warp."),
262
263 _Q(PROF_TRIGGER_3,
264 "prof_trigger_03",
265 "User profiled generic trigger that can be inserted in any place of the "
266 "code to collect the related information. Increments per warp."),
267
268 _Q(PROF_TRIGGER_4,
269 "prof_trigger_04",
270 "User profiled generic trigger that can be inserted in any place of the "
271 "code to collect the related information. Increments per warp."),
272
273 _Q(PROF_TRIGGER_5,
274 "prof_trigger_05",
275 "User profiled generic trigger that can be inserted in any place of the "
276 "code to collect the related information. Increments per warp."),
277
278 _Q(PROF_TRIGGER_6,
279 "prof_trigger_06",
280 "User profiled generic trigger that can be inserted in any place of the "
281 "code to collect the related information. Increments per warp."),
282
283 _Q(PROF_TRIGGER_7,
284 "prof_trigger_07",
285 "User profiled generic trigger that can be inserted in any place of the "
286 "code to collect the related information. Increments per warp."),
287
288 _Q(SHARED_ATOM,
289 "shared_atom",
290 "Number of ATOMS instructions executed per warp."),
291
292 _Q(SHARED_ATOM_CAS,
293 "shared_atom_cas",
294 "Number of ATOMS.CAS instructions executed per warp."),
295
296 _Q(SHARED_LD,
297 "shared_load",
298 "Number of executed load instructions where state space is specified as "
299 "shared, increments per warp on a multiprocessor"),
300
301 _Q(SHARED_LD_BANK_CONFLICT,
302 "shared_load_bank_conflict",
303 "Number of shared load bank conflict generated when the addresses for "
304 "two or more shared memory load requests fall in the same memory bank."),
305
306 _Q(SHARED_LD_REPLAY,
307 "shared_load_replay",
308 "Replays caused due to shared load bank conflict (when the addresses for "
309 "two or more shared memory load requests fall in the same memory bank) or "
310 "when there is no conflict but the total number of words accessed by all "
311 "threads in the warp executing that instruction exceed the number of words "
312 "that can be loaded in one cycle (256 bytes)"),
313
314 _Q(SHARED_LD_TRANSACTIONS,
315 "shared_ld_transactions",
316 "Number of transactions for shared load accesses. Maximum transaction "
317 "size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
318 "will cause multiple transactions for a shared load instruction. This "
319 "also includes extra transactions caused by shared bank conflicts."),
320
321 _Q(SHARED_ST,
322 "shared_store",
323 "Number of executed store instructions where state space is specified as "
324 "shared, increments per warp on a multiprocessor"),
325
326 _Q(SHARED_ST_BANK_CONFLICT,
327 "shared_store_bank_conflict",
328 "Number of shared store bank conflict generated when the addresses for "
329 "two or more shared memory store requests fall in the same memory bank."),
330
331 _Q(SHARED_ST_REPLAY,
332 "shared_store_replay",
333 "Replays caused due to shared store bank conflict (when the addresses for "
334 "two or more shared memory store requests fall in the same memory bank) or "
335 "when there is no conflict but the total number of words accessed by all "
336 "threads in the warp executing that instruction exceed the number of words "
337 "that can be stored in one cycle"),
338
339 _Q(SHARED_ST_TRANSACTIONS,
340 "shared_st_transactions",
341 "Number of transactions for shared store accesses. Maximum transaction "
342 "size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
343 "will cause multiple transactions for a shared store instruction. This "
344 "also includes extra transactions caused by shared bank conflicts."),
345
346 _Q(SM_CTA_LAUNCHED,
347 "sm_cta_launched",
348 "Number of thread blocks launched on a multiprocessor"),
349
350 _Q(THREADS_LAUNCHED,
351 "threads_launched",
352 "Number of threads launched on a multiprocessor"),
353
354 _Q(TH_INST_EXECUTED,
355 "thread_inst_executed",
356 "Number of instructions executed by all threads, does not include "
357 "replays. For each instruction it increments by the number of threads in "
358 "the warp that execute the instruction"),
359
360 _Q(TH_INST_EXECUTED_0,
361 "thread_inst_executed_0",
362 "Number of instructions executed by all threads, does not include "
363 "replays. For each instruction it increments by the number of threads in "
364 "the warp that execute the instruction in pipeline 0"),
365
366 _Q(TH_INST_EXECUTED_1,
367 "thread_inst_executed_1",
368 "Number of instructions executed by all threads, does not include "
369 "replays. For each instruction it increments by the number of threads in "
370 "the warp that execute the instruction in pipeline 1"),
371
372 _Q(TH_INST_EXECUTED_2,
373 "thread_inst_executed_2",
374 "Number of instructions executed by all threads, does not include "
375 "replays. For each instruction it increments by the number of threads in "
376 "the warp that execute the instruction in pipeline 2"),
377
378 _Q(TH_INST_EXECUTED_3,
379 "thread_inst_executed_3",
380 "Number of instructions executed by all threads, does not include "
381 "replays. For each instruction it increments by the number of threads in "
382 "the warp that execute the instruction in pipeline 3"),
383
384 _Q(UNCACHED_GLD_TRANSACTIONS,
385 "uncached_global_load_transaction",
386 "Number of uncached global load transactions. Increments by 1 per "
387 "transaction. Transaction can be 32/64/96/128B."),
388
389 _Q(WARPS_LAUNCHED,
390 "warps_launched",
391 "Number of warps launched on a multiprocessor"),
392 };
393
394 #undef _Q
395
396 static inline const char *
397 nvc0_hw_sm_query_get_name(unsigned query_type)
398 {
399 unsigned i;
400
401 for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) {
402 if (nvc0_hw_sm_queries[i].type == query_type)
403 return nvc0_hw_sm_queries[i].name;
404 }
405 assert(0);
406 return NULL;
407 }
408
409 /* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
410
411 /* Code to read out MP counters: They are accessible via mmio, too, but let's
412 * just avoid mapping registers in userspace. We'd have to know which MPs are
413 * enabled/present, too, and that information is not presently exposed.
414 * We could add a kernel interface for it, but reading the counters like this
415 * has the advantage of being async (if get_result isn't called immediately).
416 */
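/* Rough sketch of what the readout kernels below have in common, as can be
 * read from their disassembly comments: every thread samples $pm0..$pm7, but
 * only the thread with $tidx == 0 stores them.  The 64-bit destination
 * address comes from a driver-filled constant buffer (c7[0x6a0]/c7[0x6a4]),
 * fields extracted from $physid/$virtid select the per-MP slot within that
 * buffer, and one extra word read from the following constant (offset 0x6a8)
 * is stored after the counter values.
 */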
417 static const uint64_t nve4_read_hw_sm_counters_code[] =
418 {
419 /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
420 * mov b32 $r8 $tidx
421 * mov b32 $r12 $physid
422 * mov b32 $r0 $pm0
423 * mov b32 $r1 $pm1
424 * mov b32 $r2 $pm2
425 * mov b32 $r3 $pm3
426 * mov b32 $r4 $pm4
427 * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
428 * mov b32 $r5 $pm5
429 * mov b32 $r6 $pm6
430 * mov b32 $r7 $pm7
431 * set $p0 0x1 eq u32 $r8 0x0
432 * mov b32 $r10 c7[0x6a0]
433 * ext u32 $r8 $r12 0x414
434 * mov b32 $r11 c7[0x6a4]
435 * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
436 * ext u32 $r9 $r12 0x208
437 * (not $p0) exit
438 * set $p1 0x1 eq u32 $r9 0x0
439 * mul $r8 u32 $r8 u32 96
440 * mul $r12 u32 $r9 u32 16
441 * mul $r13 u32 $r9 u32 4
442 * add b32 $r9 $r8 $r13
443 * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
444 * add b32 $r8 $r8 $r12
445 * mov b32 $r12 $r10
446 * add b32 $r10 $c $r10 $r8
447 * mov b32 $r13 $r11
448 * add b32 $r11 $r11 0x0 $c
449 * add b32 $r12 $c $r12 $r9
450 * st b128 wt g[$r10d] $r0q
451 * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
452 * mov b32 $r0 c7[0x6a8]
453 * add b32 $r13 $r13 0x0 $c
454 * $p1 st b128 wt g[$r12d+0x40] $r4q
455 * st b32 wt g[$r12d+0x50] $r0
456 * exit */
457 0x2202020202020207ULL,
458 0x2c00000084021c04ULL,
459 0x2c0000000c031c04ULL,
460 0x2c00000010001c04ULL,
461 0x2c00000014005c04ULL,
462 0x2c00000018009c04ULL,
463 0x2c0000001c00dc04ULL,
464 0x2c00000020011c04ULL,
465 0x22b0420042320207ULL,
466 0x2c00000024015c04ULL,
467 0x2c00000028019c04ULL,
468 0x2c0000002c01dc04ULL,
469 0x190e0000fc81dc03ULL,
470 0x28005c1a80029de4ULL,
471 0x7000c01050c21c03ULL,
472 0x28005c1a9002dde4ULL,
473 0x204282020042e047ULL,
474 0x7000c00820c25c03ULL,
475 0x80000000000021e7ULL,
476 0x190e0000fc93dc03ULL,
477 0x1000000180821c02ULL,
478 0x1000000040931c02ULL,
479 0x1000000010935c02ULL,
480 0x4800000034825c03ULL,
481 0x22c042c042c04287ULL,
482 0x4800000030821c03ULL,
483 0x2800000028031de4ULL,
484 0x4801000020a29c03ULL,
485 0x280000002c035de4ULL,
486 0x0800000000b2dc42ULL,
487 0x4801000024c31c03ULL,
488 0x9400000000a01fc5ULL,
489 0x200002e04202c047ULL,
490 0x28005c1aa0001de4ULL,
491 0x0800000000d35c42ULL,
492 0x9400000100c107c5ULL,
493 0x9400000140c01f85ULL,
494 0x8000000000001de7ULL
495 };
496
497 static const uint64_t nvf0_read_hw_sm_counters_code[] =
498 {
499 /* Same kernel as GK104 */
500 0x0880808080808080ULL,
501 0x86400000109c0022ULL,
502 0x86400000019c0032ULL,
503 0x86400000021c0002ULL,
504 0x86400000029c0006ULL,
505 0x86400000031c000aULL,
506 0x86400000039c000eULL,
507 0x86400000041c0012ULL,
508 0x08ac1080108c8080ULL,
509 0x86400000049c0016ULL,
510 0x86400000051c001aULL,
511 0x86400000059c001eULL,
512 0xdb201c007f9c201eULL,
513 0x64c03ce0d41c002aULL,
514 0xc00000020a1c3021ULL,
515 0x64c03ce0d49c002eULL,
516 0x0810a0808010b810ULL,
517 0xc0000001041c3025ULL,
518 0x180000000020003cULL,
519 0xdb201c007f9c243eULL,
520 0xc1c00000301c2021ULL,
521 0xc1c00000081c2431ULL,
522 0xc1c00000021c2435ULL,
523 0xe0800000069c2026ULL,
524 0x08b010b010b010a0ULL,
525 0xe0800000061c2022ULL,
526 0xe4c03c00051c0032ULL,
527 0xe0840000041c282aULL,
528 0xe4c03c00059c0036ULL,
529 0xe08040007f9c2c2eULL,
530 0xe0840000049c3032ULL,
531 0xfe800000001c2800ULL,
532 0x080000b81080b010ULL,
533 0x64c03ce0d51c0002ULL,
534 0xe08040007f9c3436ULL,
535 0xfe80000020043010ULL,
536 0xfc800000281c3000ULL,
537 0x18000000001c003cULL,
538 };
539
540 static const uint64_t gm107_read_hw_sm_counters_code[] =
541 {
542 0x001d0400e4200701ULL, /* sched (st 0x1 wr 0x0) (st 0x1 wr 0x1) (st 0x1 wr 0x2) */
543 0xf0c8000002170008ULL, /* mov $r8 $tidx */
544 0xf0c800000037000cULL, /* mov $r12 $virtid */
545 0xf0c8000000470000ULL, /* mov $r0 $pm0 */
546 0x001e8400f0200761ULL, /* sched (st 0x1 wr 0x3) (st 0x1 wr 0x4) (st 0x1 wr 0x5) */
547 0xf0c8000000570001ULL, /* mov $r1 $pm1 */
548 0xf0c8000000670002ULL, /* mov $r2 $pm2 */
549 0xf0c8000000770003ULL, /* mov $r3 $pm3 */
550 0x001e8400f42007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wr 0x5) (st 0x1 wr 0x5) */
551 0xf0c8000000870004ULL, /* mov $r4 $pm4 */
552 0xf0c8000000970005ULL, /* mov $r5 $pm5 */
553 0xf0c8000000a70006ULL, /* mov $r6 $pm6 */
554 0x001f8401fc2007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wt 0x1) (st 0x1) */
555 0xf0c8000000b70007ULL, /* mov $r7 $pm7 */
556 0x5b6403800087ff07ULL, /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1 */
557 0x4c98079c1a87000aULL, /* mov $r10 c7[0x6a0] 0xf */
558 0x001fa400fc2017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x1) (st 0x9) */
559 0x3800000091470c08ULL, /* bfe u32 $r8 $r12 0x914 */
560 0x4c98079c1a97000bULL, /* mov $r11 c7[0x6a4] 0xf */
561 0x3800000020870c09ULL, /* bfe u32 $r9 $r12 0x208 */
562 0x001c1800fc2007edULL, /* sched (st 0xd) (st 0x1) (st 0x6 wr 0x0) */
563 0xe30000000008000fULL, /* not $p0 exit */
564 0x5b6403800097ff0fULL, /* isetp eq u32 and $p1 0x1 0x0 $r9 0x1 */
565 0x3838000006070808ULL, /* imul u32 u32 $r8 $r8 0x60 */
566 0x003f8400e0c00726ULL, /* sched (st 0x6 wr 0x1) (st 0x6 wr 0x0) (st 0x1 wt 0x1) */
567 0x383800000107090cULL, /* imul u32 u32 $r12 $r9 0x10 */
568 0x383800000047090dULL, /* imul u32 u32 $r13 $r9 0x4 */
569 0x5c10000000d70809ULL, /* iadd $r9 $r8 $r13 */
570 0x001f8400fcc017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x6) (st 0x1) */
571 0x5c10000000c70808ULL, /* iadd $r8 $r8 $r12 */
572 0x5c98078000a7000cULL, /* mov $r12 $r10 0xf */
573 0x5c10800000870a0aULL, /* iadd cc $r10 $r10 $r8 */
574 0x001f8400fc2007e6ULL, /* sched (st 0x6) (st 0x1) (st 0x1) */
575 0x5c98078000b7000dULL, /* mov $r13 $r11 0xf */
576 0x5c1008000ff70b0bULL, /* iadd x $r11 $r11 0x0 */
577 0x5c10800000970c0cULL, /* iadd cc $r12 $r12 $r9 */
578 0x003f983c1c4007e1ULL, /* sched (st 0x1) (st 0x2 rd 0x0 wt 0x3c) (st 0x6 wt 0x1) */
579 0x5c1008000ff70d0dULL, /* iadd x $r13 $r13 0x0 */
580 0xbfd0000000070a00ULL, /* st e wt b128 g[$r10] $r0 0x1 */
581 0x4c98079c1aa70000ULL, /* mov $r0 c7[0x6a8] 0xf */
582 0x001fbc00fc2007e6ULL, /* sched (st 0x1) (st 0x1) (st 0xf) */
583 0xbfd0000004010c04ULL, /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1 */
584 0xbf90000005070c00ULL, /* st e wt b32 g[$r12+0x50] $r0 0x1 */
585 0xe30000000007000fULL, /* exit */
586 };
587
588 /* For simplicity, we will allocate as many group slots as we allocate counter
589 * slots. This means that a single counter which wants to source from 2 groups
590 * will have to be declared as using 2 counter slots. This shouldn't really be
591 * a problem because such queries don't make much sense ... (unless someone is
592 * really creative).
593 */
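/* For a concrete example, see sm35_shared_ld_replay further down: it combines
 * signals from groups 0x13 and 0x08 and therefore occupies two counter slots
 * (num_counters = 2) even though it reports a single value.
 */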
594 struct nvc0_hw_sm_counter_cfg
595 {
596 uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */
597 uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */
598 uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
599 uint32_t sig_sel : 8; /* signal group */
600 uint32_t src_mask; /* mask for signal selection (only for NVC0:NVE4) */
601 uint32_t src_sel; /* signal selection for up to 4 sources */
602 };
603
604 struct nvc0_hw_sm_query_cfg
605 {
606 unsigned type;
607 struct nvc0_hw_sm_counter_cfg ctr[8];
608 uint8_t num_counters;
609 uint8_t norm[2]; /* normalization num,denom */
610 };
611
612 #define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, g, 0, s }
613 #define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, g, 0, s }
614 #define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c
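/* Reading aid: matched against struct nvc0_hw_sm_counter_cfg above, an entry
 * such as _CB(0x003f, B6, 0x02, 0x31483104) stands for
 *   { .func = 0x003f, .mode = NVE4_COMPUTE_MP_PM_FUNC_MODE_B6, .sig_dom = 1,
 *     .sig_sel = 0x02, .src_mask = 0, .src_sel = 0x31483104 }
 * i.e. a B-domain (MP_PM_B) counter; _CA produces the same layout with
 * sig_dom = 0 (MP_PM_A).  src_mask stays 0 here since, per the comment above,
 * it is only used on NVC0:NVE4.
 */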
615
616 /* ==== Compute capability 3.0 (GK104:GK110) ==== */
617 static const struct nvc0_hw_sm_query_cfg
618 sm30_active_cycles =
619 {
620 .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
621 .ctr[0] = _CB(0x0001, B6, 0x02, 0x00000000),
622 .num_counters = 1,
623 .norm = { 1, 1 },
624 };
625
626 static const struct nvc0_hw_sm_query_cfg
627 sm30_active_warps =
628 {
629 .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
630 .ctr[0] = _CB(0x003f, B6, 0x02, 0x31483104),
631 .num_counters = 1,
632 .norm = { 2, 1 },
633 };
634
635 static const struct nvc0_hw_sm_query_cfg
636 sm30_atom_cas_count =
637 {
638 .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
639 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x000000004),
640 .num_counters = 1,
641 .norm = { 1, 1 },
642 };
643
644 static const struct nvc0_hw_sm_query_cfg
645 sm30_atom_count =
646 {
647 .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
648 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000000),
649 .num_counters = 1,
650 .norm = { 1, 1 },
651 };
652
653 static const struct nvc0_hw_sm_query_cfg
654 sm30_branch =
655 {
656 .type = NVC0_HW_SM_QUERY_BRANCH,
657 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x0000000c),
658 .num_counters = 1,
659 .norm = { 1, 1 },
660 };
661
662 static const struct nvc0_hw_sm_query_cfg
663 sm30_divergent_branch =
664 {
665 .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
666 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000010),
667 .num_counters = 1,
668 .norm = { 1, 1 },
669 };
670
671 static const struct nvc0_hw_sm_query_cfg
672 sm30_gld_request =
673 {
674 .type = NVC0_HW_SM_QUERY_GLD_REQUEST,
675 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000010),
676 .num_counters = 1,
677 .norm = { 1, 1 },
678 };
679
680 static const struct nvc0_hw_sm_query_cfg
681 sm30_gld_mem_div_replay =
682 {
683 .type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
684 .ctr[0] = _CB(0x0001, B6, 0x08, 0x00000010),
685 .num_counters = 1,
686 .norm = { 1, 1 },
687 };
688
689 static const struct nvc0_hw_sm_query_cfg
690 sm30_gst_transactions =
691 {
692 .type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS,
693 .ctr[0] = _CB(0x0001, B6, 0x11, 0x00000004),
694 .num_counters = 1,
695 .norm = { 1, 1 },
696 };
697
698 static const struct nvc0_hw_sm_query_cfg
699 sm30_gst_mem_div_replay =
700 {
701 .type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
702 .ctr[0] = _CB(0x0001, B6, 0x08, 0x00000014),
703 .num_counters = 1,
704 .norm = { 1, 1 },
705 };
706
707 static const struct nvc0_hw_sm_query_cfg
708 sm30_gred_count =
709 {
710 .type = NVC0_HW_SM_QUERY_GRED_COUNT,
711 .ctr[0] = _CA(0x0001, B6, 0x1c, 0x00000008),
712 .num_counters = 1,
713 .norm = { 1, 1 },
714 };
715
716 static const struct nvc0_hw_sm_query_cfg
717 sm30_gst_request =
718 {
719 .type = NVC0_HW_SM_QUERY_GST_REQUEST,
720 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000014),
721 .num_counters = 1,
722 .norm = { 1, 1 },
723 };
724
725 static const struct nvc0_hw_sm_query_cfg
726 sm30_inst_executed =
727 {
728 .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
729 .ctr[0] = _CA(0x0003, B6, 0x04, 0x00000398),
730 .num_counters = 1,
731 .norm = { 1, 1 },
732 };
733
734 static const struct nvc0_hw_sm_query_cfg
735 sm30_inst_issued1 =
736 {
737 .type = NVC0_HW_SM_QUERY_INST_ISSUED1,
738 .ctr[0] = _CA(0x0001, B6, 0x05, 0x00000004),
739 .num_counters = 1,
740 .norm = { 1, 1 },
741 };
742
743 static const struct nvc0_hw_sm_query_cfg
744 sm30_inst_issued2 =
745 {
746 .type = NVC0_HW_SM_QUERY_INST_ISSUED2,
747 .ctr[0] = _CA(0x0001, B6, 0x05, 0x00000008),
748 .num_counters = 1,
749 .norm = { 1, 1 },
750 };
751
752 static const struct nvc0_hw_sm_query_cfg
753 sm30_l1_gld_hit =
754 {
755 .type = NVC0_HW_SM_QUERY_L1_GLD_HIT,
756 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000010),
757 .num_counters = 1,
758 .norm = { 1, 1 },
759 };
760
761 static const struct nvc0_hw_sm_query_cfg
762 sm30_l1_gld_miss =
763 {
764 .type = NVC0_HW_SM_QUERY_L1_GLD_MISS,
765 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000014),
766 .num_counters = 1,
767 .norm = { 1, 1 },
768 };
769
770 static const struct nvc0_hw_sm_query_cfg
771 sm30_l1_gld_transactions =
772 {
773 .type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS,
774 .ctr[0] = _CB(0x0001, B6, 0x0f, 0x00000000),
775 .num_counters = 1,
776 .norm = { 1, 1 },
777 };
778
779 static const struct nvc0_hw_sm_query_cfg
780 sm30_l1_gst_transactions =
781 {
782 .type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS,
783 .ctr[0] = _CB(0x0001, B6, 0x0f, 0x00000004),
784 .num_counters = 1,
785 .norm = { 1, 1 },
786 };
787
788 static const struct nvc0_hw_sm_query_cfg
789 sm30_l1_local_ld_hit =
790 {
791 .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT,
792 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000000),
793 .num_counters = 1,
794 .norm = { 1, 1 },
795 };
796
797 static const struct nvc0_hw_sm_query_cfg
798 sm30_l1_local_ld_miss =
799 {
800 .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS,
801 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000004),
802 .num_counters = 1,
803 .norm = { 1, 1 },
804 };
805
806 static const struct nvc0_hw_sm_query_cfg
807 sm30_l1_local_st_hit =
808 {
809 .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT,
810 .ctr[0] = _CB(0x0001, B6, 0x10, 0x00000008),
811 .num_counters = 1,
812 .norm = { 1, 1 },
813 };
814
815 static const struct nvc0_hw_sm_query_cfg
816 sm30_l1_local_st_miss =
817 {
818 .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS,
819 .ctr[0] = _CB(0x0001, B6, 0x10, 0x0000000c),
820 .num_counters = 1,
821 .norm = { 1, 1 },
822 };
823
824 static const struct nvc0_hw_sm_query_cfg
825 sm30_l1_shared_ld_transactions =
826 {
827 .type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
828 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008),
829 .num_counters = 1,
830 .norm = { 1, 1 },
831 };
832
833 static const struct nvc0_hw_sm_query_cfg
834 sm30_l1_shared_st_transactions =
835 {
836 .type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
837 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c),
838 .num_counters = 1,
839 .norm = { 1, 1 },
840 };
841
842 static const struct nvc0_hw_sm_query_cfg
843 sm30_local_ld =
844 {
845 .type = NVC0_HW_SM_QUERY_LOCAL_LD,
846 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000008),
847 .num_counters = 1,
848 .norm = { 1, 1 },
849 };
850
851 static const struct nvc0_hw_sm_query_cfg
852 sm30_local_ld_transactions =
853 {
854 .type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
855 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000),
856 .num_counters = 1,
857 .norm = { 1, 1 },
858 };
859
860 static const struct nvc0_hw_sm_query_cfg
861 sm30_local_st =
862 {
863 .type = NVC0_HW_SM_QUERY_LOCAL_ST,
864 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x0000000c),
865 .num_counters = 1,
866 .norm = { 1, 1 },
867 };
868
869 static const struct nvc0_hw_sm_query_cfg
870 sm30_local_st_transactions =
871 {
872 .type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
873 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004),
874 .num_counters = 1,
875 .norm = { 1, 1 },
876 };
877
878 static const struct nvc0_hw_sm_query_cfg
879 sm30_prof_trigger_0 =
880 {
881 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
882 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000000),
883 .num_counters = 1,
884 .norm = { 1, 1 },
885 };
886
887 static const struct nvc0_hw_sm_query_cfg
888 sm30_prof_trigger_1 =
889 {
890 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
891 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000004),
892 .num_counters = 1,
893 .norm = { 1, 1 },
894 };
895
896 static const struct nvc0_hw_sm_query_cfg
897 sm30_prof_trigger_2 =
898 {
899 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
900 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000008),
901 .num_counters = 1,
902 .norm = { 1, 1 },
903 };
904
905 static const struct nvc0_hw_sm_query_cfg
906 sm30_prof_trigger_3 =
907 {
908 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
909 .ctr[0] = _CA(0x0001, B6, 0x01, 0x0000000c),
910 .num_counters = 1,
911 .norm = { 1, 1 },
912 };
913
914 static const struct nvc0_hw_sm_query_cfg
915 sm30_prof_trigger_4 =
916 {
917 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
918 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000010),
919 .num_counters = 1,
920 .norm = { 1, 1 },
921 };
922
923 static const struct nvc0_hw_sm_query_cfg
924 sm30_prof_trigger_5 =
925 {
926 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
927 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000014),
928 .num_counters = 1,
929 .norm = { 1, 1 },
930 };
931
932 static const struct nvc0_hw_sm_query_cfg
933 sm30_prof_trigger_6 =
934 {
935 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
936 .ctr[0] = _CA(0x0001, B6, 0x01, 0x00000018),
937 .num_counters = 1,
938 .norm = { 1, 1 },
939 };
940
941 static const struct nvc0_hw_sm_query_cfg
942 sm30_prof_trigger_7 =
943 {
944 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
945 .ctr[0] = _CA(0x0001, B6, 0x01, 0x0000001c),
946 .num_counters = 1,
947 .norm = { 1, 1 },
948 };
949
950 static const struct nvc0_hw_sm_query_cfg
951 sm30_shared_ld =
952 {
953 .type = NVC0_HW_SM_QUERY_SHARED_LD,
954 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000000),
955 .num_counters = 1,
956 .norm = { 1, 1 },
957 };
958
959 static const struct nvc0_hw_sm_query_cfg
960 sm30_shared_ld_replay =
961 {
962 .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
963 .ctr[0] = _CB(0x0001, B6, 0x08, 0x00000008),
964 .num_counters = 1,
965 .norm = { 1, 1 },
966 };
967
968 static const struct nvc0_hw_sm_query_cfg
969 sm30_shared_st =
970 {
971 .type = NVC0_HW_SM_QUERY_SHARED_ST,
972 .ctr[0] = _CA(0x0001, B6, 0x1b, 0x00000004),
973 .num_counters = 1,
974 .norm = { 1, 1 },
975 };
976
977 static const struct nvc0_hw_sm_query_cfg
978 sm30_shared_st_replay =
979 {
980 .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
981 .ctr[0] = _CB(0x0001, B6, 0x08, 0x0000000c),
982 .num_counters = 1,
983 .norm = { 1, 1 },
984 };
985
986 static const struct nvc0_hw_sm_query_cfg
987 sm30_sm_cta_launched =
988 {
989 .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
990 .ctr[0] = _CB(0x0001, B6, 0x02, 0x0000001c),
991 .num_counters = 1,
992 .norm = { 1, 1 },
993 };
994
995 static const struct nvc0_hw_sm_query_cfg
996 sm30_threads_launched =
997 {
998 .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
999 .ctr[0] = _CA(0x003f, B6, 0x03, 0x398a4188),
1000 .num_counters = 1,
1001 .norm = { 1, 1 },
1002 };
1003
1004 static const struct nvc0_hw_sm_query_cfg
1005 sm30_uncached_gld_transactions =
1006 {
1007 .type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
1008 .ctr[0] = _CB(0x0001, B6, 0x11, 0x00000000),
1009 .num_counters = 1,
1010 .norm = { 1, 1 },
1011 };
1012
1013 static const struct nvc0_hw_sm_query_cfg
1014 sm30_warps_launched =
1015 {
1016 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
1017 .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004),
1018 .num_counters = 1,
1019 .norm = { 1, 1 },
1020 };
1021
1022 /* NOTES:
1023 * active_warps: bit 0 alternates between 0 and 1 for an odd number of warps
1024 * inst_executed etc.: we only count a single warp scheduler
1025 */
1026 static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] =
1027 {
1028 &sm30_active_cycles,
1029 &sm30_active_warps,
1030 &sm30_atom_cas_count,
1031 &sm30_atom_count,
1032 &sm30_branch,
1033 &sm30_divergent_branch,
1034 &sm30_gld_request,
1035 &sm30_gld_mem_div_replay,
1036 &sm30_gst_transactions,
1037 &sm30_gst_mem_div_replay,
1038 &sm30_gred_count,
1039 &sm30_gst_request,
1040 &sm30_inst_executed,
1041 &sm30_inst_issued1,
1042 &sm30_inst_issued2,
1043 &sm30_l1_gld_hit,
1044 &sm30_l1_gld_miss,
1045 &sm30_l1_gld_transactions,
1046 &sm30_l1_gst_transactions,
1047 &sm30_l1_local_ld_hit,
1048 &sm30_l1_local_ld_miss,
1049 &sm30_l1_local_st_hit,
1050 &sm30_l1_local_st_miss,
1051 &sm30_l1_shared_ld_transactions,
1052 &sm30_l1_shared_st_transactions,
1053 &sm30_local_ld,
1054 &sm30_local_ld_transactions,
1055 &sm30_local_st,
1056 &sm30_local_st_transactions,
1057 &sm30_prof_trigger_0,
1058 &sm30_prof_trigger_1,
1059 &sm30_prof_trigger_2,
1060 &sm30_prof_trigger_3,
1061 &sm30_prof_trigger_4,
1062 &sm30_prof_trigger_5,
1063 &sm30_prof_trigger_6,
1064 &sm30_prof_trigger_7,
1065 &sm30_shared_ld,
1066 &sm30_shared_ld_replay,
1067 &sm30_shared_st,
1068 &sm30_shared_st_replay,
1069 &sm30_sm_cta_launched,
1070 &sm30_threads_launched,
1071 &sm30_uncached_gld_transactions,
1072 &sm30_warps_launched,
1073 };
1074
1075 /* ==== Compute capability 3.5 (GK110/GK208) ==== */
1076 static const struct nvc0_hw_sm_query_cfg
1077 sm35_atom_cas_count =
1078 {
1079 .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
1080 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000014),
1081 .num_counters = 1,
1082 .norm = { 1, 1 },
1083 };
1084
1085 static const struct nvc0_hw_sm_query_cfg
1086 sm35_atom_count =
1087 {
1088 .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
1089 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010),
1090 .num_counters = 1,
1091 .norm = { 1, 1 },
1092 };
1093
1094 static const struct nvc0_hw_sm_query_cfg
1095 sm35_gred_count =
1096 {
1097 .type = NVC0_HW_SM_QUERY_GRED_COUNT,
1098 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000018),
1099 .num_counters = 1,
1100 .norm = { 1, 1 },
1101 };
1102
1103 static const struct nvc0_hw_sm_query_cfg
1104 sm35_not_pred_off_inst_executed =
1105 {
1106 .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
1107 .ctr[0] = _CA(0x003f, B6, 0x14, 0x29062080),
1108 .num_counters = 1,
1109 .norm = { 1, 1 },
1110 };
1111
1112 static const struct nvc0_hw_sm_query_cfg
1113 sm35_shared_ld_replay =
1114 {
1115 .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
1116 .ctr[0] = _CB(0xaaaa, LOGOP, 0x13, 0x00000018),
1117 .ctr[1] = _CB(0x8888, LOGOP, 0x08, 0x00000151),
1118 .num_counters = 2,
1119 .norm = { 1, 1 },
1120 };
1121
1122 static const struct nvc0_hw_sm_query_cfg
1123 sm35_shared_st_replay =
1124 {
1125 .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
1126 .ctr[0] = _CB(0xaaaa, LOGOP, 0x13, 0x00000018),
1127 .ctr[1] = _CB(0x8888, LOGOP, 0x08, 0x000001d1),
1128 .num_counters = 2,
1129 .norm = { 1, 1 },
1130 };
1131
1132 static const struct nvc0_hw_sm_query_cfg
1133 sm35_th_inst_executed =
1134 {
1135 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
1136 .ctr[0] = _CA(0x003f, B6, 0x11, 0x29062080),
1137 .num_counters = 1,
1138 .norm = { 1, 1 },
1139 };
1140
1141 static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] =
1142 {
1143 &sm30_active_cycles,
1144 &sm30_active_warps,
1145 &sm35_atom_cas_count,
1146 &sm35_atom_count,
1147 &sm30_gld_request,
1148 &sm30_gld_mem_div_replay,
1149 &sm30_gst_transactions,
1150 &sm30_gst_mem_div_replay,
1151 &sm35_gred_count,
1152 &sm30_gst_request,
1153 &sm30_inst_executed,
1154 &sm30_inst_issued1,
1155 &sm30_inst_issued2,
1156 &sm30_l1_gld_hit,
1157 &sm30_l1_gld_miss,
1158 &sm30_l1_gld_transactions,
1159 &sm30_l1_gst_transactions,
1160 &sm30_l1_local_ld_hit,
1161 &sm30_l1_local_ld_miss,
1162 &sm30_l1_local_st_hit,
1163 &sm30_l1_local_st_miss,
1164 &sm30_l1_shared_ld_transactions,
1165 &sm30_l1_shared_st_transactions,
1166 &sm30_local_ld,
1167 &sm30_local_ld_transactions,
1168 &sm30_local_st,
1169 &sm30_local_st_transactions,
1170 &sm35_not_pred_off_inst_executed,
1171 &sm30_prof_trigger_0,
1172 &sm30_prof_trigger_1,
1173 &sm30_prof_trigger_2,
1174 &sm30_prof_trigger_3,
1175 &sm30_prof_trigger_4,
1176 &sm30_prof_trigger_5,
1177 &sm30_prof_trigger_6,
1178 &sm30_prof_trigger_7,
1179 &sm30_shared_ld,
1180 &sm35_shared_ld_replay,
1181 &sm30_shared_st,
1182 &sm35_shared_st_replay,
1183 &sm30_sm_cta_launched,
1184 &sm35_th_inst_executed,
1185 &sm30_threads_launched,
1186 &sm30_uncached_gld_transactions,
1187 &sm30_warps_launched,
1188 };
1189
1190 /* ==== Compute capability 5.0 (GM107/GM108) ==== */
1191 static const struct nvc0_hw_sm_query_cfg
1192 sm50_active_ctas =
1193 {
1194 .type = NVC0_HW_SM_QUERY_ACTIVE_CTAS,
1195 .ctr[0] = _CB(0x003f, B6, 0x01, 0x29062080),
1196 .num_counters = 1,
1197 .norm = { 1, 1 },
1198 };
1199
1200 static const struct nvc0_hw_sm_query_cfg
1201 sm50_active_cycles =
1202 {
1203 .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
1204 .ctr[0] = _CB(0x0001, B6, 0x00, 0x00000004),
1205 .num_counters = 1,
1206 .norm = { 1, 1 },
1207 };
1208
1209 static const struct nvc0_hw_sm_query_cfg
1210 sm50_active_warps =
1211 {
1212 .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
1213 .ctr[0] = _CB(0x003f, B6, 0x00, 0x398a4188),
1214 .num_counters = 1,
1215 .norm = { 1, 1 },
1216 };
1217
1218 static const struct nvc0_hw_sm_query_cfg
1219 sm50_atom_count =
1220 {
1221 .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
1222 .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000004),
1223 .num_counters = 1,
1224 .norm = { 1, 1 },
1225 };
1226
1227 static const struct nvc0_hw_sm_query_cfg
1228 sm50_branch =
1229 {
1230 .type = NVC0_HW_SM_QUERY_BRANCH,
1231 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010),
1232 .num_counters = 1,
1233 .norm = { 1, 1 },
1234 };
1235
1236 static const struct nvc0_hw_sm_query_cfg
1237 sm50_divergent_branch =
1238 {
1239 .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
1240 .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000004),
1241 .num_counters = 1,
1242 .norm = { 1, 1 },
1243 };
1244
1245 static const struct nvc0_hw_sm_query_cfg
1246 sm50_global_atom_cas =
1247 {
1248 .type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
1249 .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000000),
1250 .num_counters = 1,
1251 .norm = { 1, 1 },
1252 };
1253
1254 static const struct nvc0_hw_sm_query_cfg
1255 sm50_global_ld =
1256 {
1257 .type = NVC0_HW_SM_QUERY_GLOBAL_LD,
1258 .ctr[0] = _CA(0x0001, B6, 0x14, 0x0000000c),
1259 .num_counters = 1,
1260 .norm = { 1, 1 },
1261 };
1262
1263 static const struct nvc0_hw_sm_query_cfg
1264 sm50_global_st =
1265 {
1266 .type = NVC0_HW_SM_QUERY_GLOBAL_ST,
1267 .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000010),
1268 .num_counters = 1,
1269 .norm = { 1, 1 },
1270 };
1271
1272 static const struct nvc0_hw_sm_query_cfg
1273 sm50_gred_count =
1274 {
1275 .type = NVC0_HW_SM_QUERY_GRED_COUNT,
1276 .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000008),
1277 .num_counters = 1,
1278 .norm = { 1, 1 },
1279 };
1280
1281 static const struct nvc0_hw_sm_query_cfg
1282 sm50_inst_executed =
1283 {
1284 .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
1285 .ctr[0] = _CA(0x0003, B6, 0x02, 0x00000398),
1286 .num_counters = 1,
1287 .norm = { 1, 1 },
1288 };
1289
1290 static const struct nvc0_hw_sm_query_cfg
1291 sm50_inst_issued0 =
1292 {
1293 .type = NVC0_HW_SM_QUERY_INST_ISSUED0,
1294 .ctr[0] = _CA(0x0001, B6, 0x02, 0x0000000c),
1295 .num_counters = 1,
1296 .norm = { 1, 1 },
1297 };
1298
1299 static const struct nvc0_hw_sm_query_cfg
1300 sm50_inst_issued1 =
1301 {
1302 .type = NVC0_HW_SM_QUERY_INST_ISSUED1,
1303 .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000010),
1304 .num_counters = 1,
1305 .norm = { 1, 1 },
1306 };
1307
1308 static const struct nvc0_hw_sm_query_cfg
1309 sm50_inst_issued2 =
1310 {
1311 .type = NVC0_HW_SM_QUERY_INST_ISSUED2,
1312 .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000014),
1313 .num_counters = 1,
1314 .norm = { 1, 1 },
1315 };
1316
1317 static const struct nvc0_hw_sm_query_cfg
1318 sm50_local_ld =
1319 {
1320 .type = NVC0_HW_SM_QUERY_LOCAL_LD,
1321 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000004),
1322 .num_counters = 1,
1323 .norm = { 1, 1 },
1324 };
1325
1326 static const struct nvc0_hw_sm_query_cfg
1327 sm50_local_st =
1328 {
1329 .type = NVC0_HW_SM_QUERY_LOCAL_ST,
1330 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000000),
1331 .num_counters = 1,
1332 .norm = { 1, 1 },
1333 };
1334
1335 static const struct nvc0_hw_sm_query_cfg
1336 sm50_not_pred_off_inst_executed =
1337 {
1338 .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
1339 .ctr[0] = _CA(0x003f, B6, 0x05, 0x29062080),
1340 .num_counters = 1,
1341 .norm = { 1, 1 },
1342 };
1343
1344 static const struct nvc0_hw_sm_query_cfg
1345 sm50_prof_trigger_0 =
1346 {
1347 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
1348 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000000),
1349 .num_counters = 1,
1350 .norm = { 1, 1 },
1351 };
1352
1353 static const struct nvc0_hw_sm_query_cfg
1354 sm50_prof_trigger_1 =
1355 {
1356 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
1357 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000004),
1358 .num_counters = 1,
1359 .norm = { 1, 1 },
1360 };
1361
1362 static const struct nvc0_hw_sm_query_cfg
1363 sm50_prof_trigger_2 =
1364 {
1365 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
1366 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000008),
1367 .num_counters = 1,
1368 .norm = { 1, 1 },
1369 };
1370
1371 static const struct nvc0_hw_sm_query_cfg
1372 sm50_prof_trigger_3 =
1373 {
1374 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
1375 .ctr[0] = _CA(0x0001, B6, 0x00, 0x0000000c),
1376 .num_counters = 1,
1377 .norm = { 1, 1 },
1378 };
1379
1380 static const struct nvc0_hw_sm_query_cfg
1381 sm50_prof_trigger_4 =
1382 {
1383 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
1384 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000010),
1385 .num_counters = 1,
1386 .norm = { 1, 1 },
1387 };
1388
1389 static const struct nvc0_hw_sm_query_cfg
1390 sm50_prof_trigger_5 =
1391 {
1392 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
1393 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000014),
1394 .num_counters = 1,
1395 .norm = { 1, 1 },
1396 };
1397
1398 static const struct nvc0_hw_sm_query_cfg
1399 sm50_prof_trigger_6 =
1400 {
1401 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
1402 .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000018),
1403 .num_counters = 1,
1404 .norm = { 1, 1 },
1405 };
1406
1407 static const struct nvc0_hw_sm_query_cfg
1408 sm50_prof_trigger_7 =
1409 {
1410 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
1411 .ctr[0] = _CA(0x0001, B6, 0x00, 0x0000001c),
1412 .num_counters = 1,
1413 .norm = { 1, 1 },
1414 };
1415
1416 static const struct nvc0_hw_sm_query_cfg
1417 sm50_shared_atom =
1418 {
1419 .type = NVC0_HW_SM_QUERY_SHARED_ATOM,
1420 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000014),
1421 .num_counters = 1,
1422 .norm = { 1, 1 },
1423 };
1424
1425 static const struct nvc0_hw_sm_query_cfg
1426 sm50_shared_atom_cas =
1427 {
1428 .type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
1429 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000010),
1430 .num_counters = 1,
1431 .norm = { 1, 1 },
1432 };
1433
1434 static const struct nvc0_hw_sm_query_cfg
1435 sm50_shared_ld =
1436 {
1437 .type = NVC0_HW_SM_QUERY_SHARED_LD,
1438 .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000008),
1439 .num_counters = 1,
1440 .norm = { 1, 1 },
1441 };
1442
1443 static const struct nvc0_hw_sm_query_cfg
1444 sm50_shared_ld_bank_conflict =
1445 {
1446 .type = NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
1447 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000),
1448 .num_counters = 1,
1449 .norm = { 1, 1 },
1450 };
1451
1452 static const struct nvc0_hw_sm_query_cfg
1453 sm50_shared_ld_transactions =
1454 {
1455 .type = NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
1456 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008),
1457 .num_counters = 1,
1458 .norm = { 1, 1 },
1459 };
1460
1461 static const struct nvc0_hw_sm_query_cfg
1462 sm50_shared_st =
1463 {
1464 .type = NVC0_HW_SM_QUERY_SHARED_ST,
1465 .ctr[0] = _CA(0x0001, B6, 0x13, 0x0000000c),
1466 .num_counters = 1,
1467 .norm = { 1, 1 },
1468 };
1469
1470 static const struct nvc0_hw_sm_query_cfg
1471 sm50_shared_st_bank_conflict =
1472 {
1473 .type = NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
1474 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004),
1475 .num_counters = 1,
1476 .norm = { 1, 1 },
1477 };
1478
1479 static const struct nvc0_hw_sm_query_cfg
1480 sm50_shared_st_transactions =
1481 {
1482 .type = NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
1483 .ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c),
1484 .num_counters = 1,
1485 .norm = { 1, 1 },
1486 };
1487
1488 static const struct nvc0_hw_sm_query_cfg
1489 sm50_sm_cta_launched =
1490 {
1491 .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
1492 .ctr[0] = _CB(0x0001, B6, 0x01, 0x00000018),
1493 .num_counters = 1,
1494 .norm = { 1, 1 },
1495 };
1496
1497 static const struct nvc0_hw_sm_query_cfg
1498 sm50_th_inst_executed =
1499 {
1500 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
1501 .ctr[0] = _CA(0x003f, B6, 0x04, 0x29062080),
1502 .num_counters = 1,
1503 .norm = { 1, 1 },
1504 };
1505
1506 static const struct nvc0_hw_sm_query_cfg
1507 sm50_warps_launched =
1508 {
1509 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
1510 .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000008),
1511 .num_counters = 1,
1512 .norm = { 1, 1 },
1513 };
1514
1515 static const struct nvc0_hw_sm_query_cfg *sm50_hw_sm_queries[] =
1516 {
1517 &sm50_active_ctas,
1518 &sm50_active_cycles,
1519 &sm50_active_warps,
1520 &sm50_atom_count,
1521 &sm50_branch,
1522 &sm50_divergent_branch,
1523 &sm50_global_atom_cas,
1524 &sm50_global_ld,
1525 &sm50_global_st,
1526 &sm50_gred_count,
1527 &sm50_inst_executed,
1528 &sm50_inst_issued0,
1529 &sm50_inst_issued1,
1530 &sm50_inst_issued2,
1531 &sm50_local_ld,
1532 &sm50_local_st,
1533 &sm50_not_pred_off_inst_executed,
1534 &sm50_prof_trigger_0,
1535 &sm50_prof_trigger_1,
1536 &sm50_prof_trigger_2,
1537 &sm50_prof_trigger_3,
1538 &sm50_prof_trigger_4,
1539 &sm50_prof_trigger_5,
1540 &sm50_prof_trigger_6,
1541 &sm50_prof_trigger_7,
1542 &sm50_shared_atom,
1543 &sm50_shared_atom_cas,
1544 &sm50_shared_ld,
1545 &sm50_shared_ld_bank_conflict,
1546 &sm50_shared_ld_transactions,
1547 &sm50_shared_st,
1548 &sm50_shared_st_bank_conflict,
1549 &sm50_shared_st_transactions,
1550 &sm50_sm_cta_launched,
1551 &sm50_th_inst_executed,
1552 &sm50_warps_launched,
1553 };
1554
1555 /* ==== Compute capability 5.2 (GM200/GM204/GM206) ==== */
1556 static const struct nvc0_hw_sm_query_cfg
1557 sm52_atom_count =
1558 {
1559 .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
1560 .ctr[0] = _CA(0x0001, B6, 0x0a, 0x0000001c),
1561 .num_counters = 1,
1562 .norm = { 1, 1 },
1563 };
1564
1565 static const struct nvc0_hw_sm_query_cfg
1566 sm52_global_atom_cas =
1567 {
1568 .type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
1569 .ctr[0] = _CA(0x0001, B6, 0x0a, 0x00000018),
1570 .num_counters = 1,
1571 .norm = { 1, 1 },
1572 };
1573
1574 static const struct nvc0_hw_sm_query_cfg
1575 sm52_global_ld =
1576 {
1577 .type = NVC0_HW_SM_QUERY_GLOBAL_LD,
1578 .ctr[0] = _CA(0x0001, B6, 0x0b, 0x00000018),
1579 .num_counters = 1,
1580 .norm = { 1, 1 },
1581 };
1582
1583 static const struct nvc0_hw_sm_query_cfg
1584 sm52_global_st =
1585 {
1586 .type = NVC0_HW_SM_QUERY_GLOBAL_ST,
1587 .ctr[0] = _CA(0x0001, B6, 0x0b, 0x0000001c),
1588 .num_counters = 1,
1589 .norm = { 1, 1 },
1590 };
1591
1592 static const struct nvc0_hw_sm_query_cfg
1593 sm52_gred_count =
1594 {
1595 .type = NVC0_HW_SM_QUERY_GRED_COUNT,
1596 .ctr[0] = _CA(0x0001, B6, 0x0f, 0x00000018),
1597 .num_counters = 1,
1598 .norm = { 1, 1 },
1599 };
1600
1601 static const struct nvc0_hw_sm_query_cfg
1602 sm52_inst_executed =
1603 {
1604 .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
1605 .ctr[0] = _CA(0x0003, B6, 0x03, 0x0000020c),
1606 .num_counters = 1,
1607 .norm = { 1, 1 },
1608 };
1609
1610 static const struct nvc0_hw_sm_query_cfg
1611 sm52_inst_issued0 =
1612 {
1613 .type = NVC0_HW_SM_QUERY_INST_ISSUED0,
1614 .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000000),
1615 .num_counters = 1,
1616 .norm = { 1, 1 },
1617 };
1618
1619 static const struct nvc0_hw_sm_query_cfg
1620 sm52_inst_issued1 =
1621 {
1622 .type = NVC0_HW_SM_QUERY_INST_ISSUED1,
1623 .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004),
1624 .num_counters = 1,
1625 .norm = { 1, 1 },
1626 };
1627
1628 static const struct nvc0_hw_sm_query_cfg
1629 sm52_inst_issued2 =
1630 {
1631 .type = NVC0_HW_SM_QUERY_INST_ISSUED2,
1632 .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000008),
1633 .num_counters = 1,
1634 .norm = { 1, 1 },
1635 };
1636
1637 static const struct nvc0_hw_sm_query_cfg
1638 sm52_local_ld =
1639 {
1640 .type = NVC0_HW_SM_QUERY_LOCAL_LD,
1641 .ctr[0] = _CA(0x0001, B6, 0x06, 0x0000001c),
1642 .num_counters = 1,
1643 .norm = { 1, 1 },
1644 };
1645
1646 static const struct nvc0_hw_sm_query_cfg
1647 sm52_local_st =
1648 {
1649 .type = NVC0_HW_SM_QUERY_LOCAL_ST,
1650 .ctr[0] = _CA(0x0001, B6, 0x06, 0x00000018),
1651 .num_counters = 1,
1652 .norm = { 1, 1 },
1653 };
1654
1655 static const struct nvc0_hw_sm_query_cfg
1656 sm52_shared_atom =
1657 {
1658 .type = NVC0_HW_SM_QUERY_SHARED_ATOM,
1659 .ctr[0] = _CA(0x0001, B6, 0x08, 0x0000001c),
1660 .num_counters = 1,
1661 .norm = { 1, 1 },
1662 };
1663
1664 static const struct nvc0_hw_sm_query_cfg
1665 sm52_shared_atom_cas =
1666 {
1667 .type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
1668 .ctr[0] = _CA(0x0001, B6, 0x08, 0x00000018),
1669 .num_counters = 1,
1670 .norm = { 1, 1 },
1671 };
1672
1673 static const struct nvc0_hw_sm_query_cfg
1674 sm52_shared_ld =
1675 {
1676 .type = NVC0_HW_SM_QUERY_SHARED_LD,
1677 .ctr[0] = _CA(0x0001, B6, 0x07, 0x00000018),
1678 .num_counters = 1,
1679 .norm = { 1, 1 },
1680 };
1681
1682 static const struct nvc0_hw_sm_query_cfg
1683 sm52_shared_st =
1684 {
1685 .type = NVC0_HW_SM_QUERY_SHARED_ST,
1686 .ctr[0] = _CA(0x0001, B6, 0x07, 0x0000001c),
1687 .num_counters = 1,
1688 .norm = { 1, 1 },
1689 };
1690
1691 static const struct nvc0_hw_sm_query_cfg
1692 sm52_warps_launched =
1693 {
1694 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
1695 .ctr[0] = _CA(0x0001, B6, 0x02, 0x0000001c),
1696 .num_counters = 1,
1697 .norm = { 1, 1 },
1698 };
1699
1700 static const struct nvc0_hw_sm_query_cfg *sm52_hw_sm_queries[] =
1701 {
1702 &sm50_active_ctas,
1703 &sm50_active_cycles,
1704 &sm50_active_warps,
1705 &sm52_atom_count,
1706 &sm50_branch,
1707 &sm50_divergent_branch,
1708 &sm52_global_atom_cas,
1709 &sm52_global_ld,
1710 &sm52_global_st,
1711 &sm52_gred_count,
1712 &sm52_inst_executed,
1713 &sm52_inst_issued0,
1714 &sm52_inst_issued1,
1715 &sm52_inst_issued2,
1716 &sm52_local_ld,
1717 &sm52_local_st,
1718 &sm50_not_pred_off_inst_executed,
1719 &sm50_prof_trigger_0,
1720 &sm50_prof_trigger_1,
1721 &sm50_prof_trigger_2,
1722 &sm50_prof_trigger_3,
1723 &sm50_prof_trigger_4,
1724 &sm50_prof_trigger_5,
1725 &sm50_prof_trigger_6,
1726 &sm50_prof_trigger_7,
1727 &sm52_shared_atom,
1728 &sm52_shared_atom_cas,
1729 &sm52_shared_ld,
1730 &sm50_shared_ld_bank_conflict,
1731 &sm50_shared_ld_transactions,
1732 &sm52_shared_st,
1733 &sm50_shared_st_bank_conflict,
1734 &sm50_shared_st_transactions,
1735 &sm50_sm_cta_launched,
1736 &sm50_th_inst_executed,
1737 &sm52_warps_launched,
1738 };
1739
1740 #undef _Q
1741 #undef _CA
1742 #undef _CB
1743
1744 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
1745 /* NOTES:
1746 * - MP counters on GF100/GF110 (compute capability 2.0) are buggy
1747 * because there is a context-switch problem that we need to fix.
1748 * Results might sometimes be wrong; be careful!
1749 */
1750 static const uint64_t nvc0_read_hw_sm_counters_code[] =
1751 {
1752 /* mov b32 $r8 $tidx
1753 * mov b32 $r9 $physid
1754 * mov b32 $r0 $pm0
1755 * mov b32 $r1 $pm1
1756 * mov b32 $r2 $pm2
1757 * mov b32 $r3 $pm3
1758 * mov b32 $r4 $pm4
1759 * mov b32 $r5 $pm5
1760 * mov b32 $r6 $pm6
1761 * mov b32 $r7 $pm7
1762 * set $p0 0x1 eq u32 $r8 0x0
1763 * mov b32 $r10 c15[0x6a0]
1764 * mov b32 $r11 c15[0x6a4]
1765 * ext u32 $r8 $r9 0x414
1766 * (not $p0) exit
1767 * mul $r8 u32 $r8 u32 48
1768 * add b32 $r10 $c $r10 $r8
1769 * add b32 $r11 $r11 0x0 $c
1770 * mov b32 $r8 c15[0x6a8]
1771 * st b128 wt g[$r10d+0x00] $r0q
1772 * st b128 wt g[$r10d+0x10] $r4q
1773 * st b32 wt g[$r10d+0x20] $r8
1774 * exit */
1775 0x2c00000084021c04ULL,
1776 0x2c0000000c025c04ULL,
1777 0x2c00000010001c04ULL,
1778 0x2c00000014005c04ULL,
1779 0x2c00000018009c04ULL,
1780 0x2c0000001c00dc04ULL,
1781 0x2c00000020011c04ULL,
1782 0x2c00000024015c04ULL,
1783 0x2c00000028019c04ULL,
1784 0x2c0000002c01dc04ULL,
1785 0x190e0000fc81dc03ULL,
1786 0x28007c1a80029de4ULL,
1787 0x28007c1a9002dde4ULL,
1788 0x7000c01050921c03ULL,
1789 0x80000000000021e7ULL,
1790 0x10000000c0821c02ULL,
1791 0x4801000020a29c03ULL,
1792 0x0800000000b2dc42ULL,
1793 0x28007c1aa0021de4ULL,
1794 0x9400000000a01fc5ULL,
1795 0x9400000040a11fc5ULL,
1796 0x9400000080a21f85ULL,
1797 0x8000000000001de7ULL
1798 };
1799
1800 #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s }
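/* Reading aid, analogous to _CA/_CB above but for the NVC0:NVE4 counter
 * layout: _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000) stands for
 *   { .func = 0xaaaa, .mode = NVC0_COMPUTE_MP_PM_OP_MODE_LOGOP, .sig_dom = 0,
 *     .sig_sel = 0x11, .src_mask = 0x000000ff, .src_sel = 0x00000000 }
 * Here src_mask/src_sel really are used to pick the input signals (see the
 * nvc0_hw_sm_counter_cfg comments).
 */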
1801
1802 /* ==== Compute capability 2.0 (GF100/GF110) ==== */
1803 static const struct nvc0_hw_sm_query_cfg
1804 sm20_active_cycles =
1805 {
1806 .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
1807 .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000),
1808 .num_counters = 1,
1809 .norm = { 1, 1 },
1810 };
1811
1812 static const struct nvc0_hw_sm_query_cfg
1813 sm20_active_warps =
1814 {
1815 .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
1816 .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010),
1817 .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020),
1818 .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030),
1819 .ctr[3] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000040),
1820 .ctr[4] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000050),
1821 .ctr[5] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000060),
1822 .num_counters = 6,
1823 .norm = { 1, 1 },
1824 };
1825
1826 static const struct nvc0_hw_sm_query_cfg
1827 sm20_atom_count =
1828 {
1829 .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
1830 .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030),
1831 .num_counters = 1,
1832 .norm = { 1, 1 },
1833 };
1834
1835 static const struct nvc0_hw_sm_query_cfg
1836 sm20_branch =
1837 {
1838 .type = NVC0_HW_SM_QUERY_BRANCH,
1839 .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000),
1840 .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010),
1841 .num_counters = 2,
1842 .norm = { 1, 1 },
1843 };
1844
1845 static const struct nvc0_hw_sm_query_cfg
1846 sm20_divergent_branch =
1847 {
1848 .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
1849 .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020),
1850 .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030),
1851 .num_counters = 2,
1852 .norm = { 1, 1 },
1853 };
1854
1855 static const struct nvc0_hw_sm_query_cfg
1856 sm20_gld_request =
1857 {
1858 .type = NVC0_HW_SM_QUERY_GLD_REQUEST,
1859 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030),
1860 .num_counters = 1,
1861 .norm = { 1, 1 },
1862 };
1863
1864 static const struct nvc0_hw_sm_query_cfg
1865 sm20_gred_count =
1866 {
1867 .type = NVC0_HW_SM_QUERY_GRED_COUNT,
1868 .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040),
1869 .num_counters = 1,
1870 .norm = { 1, 1 },
1871 };
1872
1873 static const struct nvc0_hw_sm_query_cfg
1874 sm20_gst_request =
1875 {
1876 .type = NVC0_HW_SM_QUERY_GST_REQUEST,
1877 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060),
1878 .num_counters = 1,
1879 .norm = { 1, 1 },
1880 };
1881
1882 static const struct nvc0_hw_sm_query_cfg
1883 sm20_inst_executed =
1884 {
1885 .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
1886 .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000),
1887 .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010),
1888 .num_counters = 2,
1889 .norm = { 1, 1 },
1890 };
1891
1892 static const struct nvc0_hw_sm_query_cfg
1893 sm20_inst_issued =
1894 {
1895 .type = NVC0_HW_SM_QUERY_INST_ISSUED,
1896 .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060),
1897 .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070),
1898 .num_counters = 2,
1899 .norm = { 1, 1 },
1900 };
1901
1902 static const struct nvc0_hw_sm_query_cfg
1903 sm20_local_ld =
1904 {
1905 .type = NVC0_HW_SM_QUERY_LOCAL_LD,
1906 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020),
1907 .num_counters = 1,
1908 .norm = { 1, 1 },
1909 };
1910
1911 static const struct nvc0_hw_sm_query_cfg
1912 sm20_local_st =
1913 {
1914 .type = NVC0_HW_SM_QUERY_LOCAL_ST,
1915 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050),
1916 .num_counters = 1,
1917 .norm = { 1, 1 },
1918 };
1919
1920 static const struct nvc0_hw_sm_query_cfg
1921 sm20_prof_trigger_0 =
1922 {
1923 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
1924 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000),
1925 .num_counters = 1,
1926 .norm = { 1, 1 },
1927 };
1928
1929 static const struct nvc0_hw_sm_query_cfg
1930 sm20_prof_trigger_1 =
1931 {
1932 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
1933 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010),
1934 .num_counters = 1,
1935 .norm = { 1, 1 },
1936 };
1937
1938 static const struct nvc0_hw_sm_query_cfg
1939 sm20_prof_trigger_2 =
1940 {
1941 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
1942 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020),
1943 .num_counters = 1,
1944 .norm = { 1, 1 },
1945 };
1946
1947 static const struct nvc0_hw_sm_query_cfg
1948 sm20_prof_trigger_3 =
1949 {
1950 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
1951 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030),
1952 .num_counters = 1,
1953 .norm = { 1, 1 },
1954 };
1955
1956 static const struct nvc0_hw_sm_query_cfg
1957 sm20_prof_trigger_4 =
1958 {
1959 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
1960 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040),
1961 .num_counters = 1,
1962 .norm = { 1, 1 },
1963 };
1964
1965 static const struct nvc0_hw_sm_query_cfg
1966 sm20_prof_trigger_5 =
1967 {
1968 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
1969 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050),
1970 .num_counters = 1,
1971 .norm = { 1, 1 },
1972 };
1973
1974 static const struct nvc0_hw_sm_query_cfg
1975 sm20_prof_trigger_6 =
1976 {
1977 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
1978 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060),
1979 .num_counters = 1,
1980 .norm = { 1, 1 },
1981 };
1982
1983 static const struct nvc0_hw_sm_query_cfg
1984 sm20_prof_trigger_7 =
1985 {
1986 .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
1987 .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070),
1988 .num_counters = 1,
1989 .norm = { 1, 1 },
1990 };
1991
1992 static const struct nvc0_hw_sm_query_cfg
1993 sm20_shared_ld =
1994 {
1995 .type = NVC0_HW_SM_QUERY_SHARED_LD,
1996 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010),
1997 .num_counters = 1,
1998 .norm = { 1, 1 },
1999 };
2000
2001 static const struct nvc0_hw_sm_query_cfg
2002 sm20_shared_st =
2003 {
2004 .type = NVC0_HW_SM_QUERY_SHARED_ST,
2005 .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040),
2006 .num_counters = 1,
2007 .norm = { 1, 1 },
2008 };
2009
2010 static const struct nvc0_hw_sm_query_cfg
2011 sm20_threads_launched =
2012 {
2013 .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
2014 .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010),
2015 .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020),
2016 .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030),
2017 .ctr[3] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000040),
2018 .ctr[4] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000050),
2019 .ctr[5] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000060),
2020 .num_counters = 6,
2021 .norm = { 1, 1 },
2022 };
2023
2024 static const struct nvc0_hw_sm_query_cfg
2025 sm20_th_inst_executed_0 =
2026 {
2027 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
2028 .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000),
2029 .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010),
2030 .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020),
2031 .ctr[3] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000030),
2032 .ctr[4] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000040),
2033 .ctr[5] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000050),
2034 .num_counters = 6,
2035 .norm = { 1, 1 },
2036 };
2037
2038 static const struct nvc0_hw_sm_query_cfg
2039 sm20_th_inst_executed_1 =
2040 {
2041 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
2042 .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000),
2043 .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010),
2044 .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020),
2045 .ctr[3] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000030),
2046 .ctr[4] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000040),
2047 .ctr[5] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000050),
2048 .num_counters = 6,
2049 .norm = { 1, 1 },
2050 };
2051
2052 static const struct nvc0_hw_sm_query_cfg
2053 sm20_warps_launched =
2054 {
2055 .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
2056 .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000),
2057 .num_counters = 1,
2058 .norm = { 1, 1 },
2059 };
2060
2061 static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] =
2062 {
2063 &sm20_active_cycles,
2064 &sm20_active_warps,
2065 &sm20_atom_count,
2066 &sm20_branch,
2067 &sm20_divergent_branch,
2068 &sm20_gld_request,
2069 &sm20_gred_count,
2070 &sm20_gst_request,
2071 &sm20_inst_executed,
2072 &sm20_inst_issued,
2073 &sm20_local_ld,
2074 &sm20_local_st,
2075 &sm20_prof_trigger_0,
2076 &sm20_prof_trigger_1,
2077 &sm20_prof_trigger_2,
2078 &sm20_prof_trigger_3,
2079 &sm20_prof_trigger_4,
2080 &sm20_prof_trigger_5,
2081 &sm20_prof_trigger_6,
2082 &sm20_prof_trigger_7,
2083 &sm20_shared_ld,
2084 &sm20_shared_st,
2085 &sm20_threads_launched,
2086 &sm20_th_inst_executed_0,
2087 &sm20_th_inst_executed_1,
2088 &sm20_warps_launched,
2089 };
2090
2091 /* ==== Compute capability 2.1 (all Fermi chips except GF100/GF110) ==== */
2092 static const struct nvc0_hw_sm_query_cfg
2093 sm21_inst_executed =
2094 {
2095 .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
2096 .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000),
2097 .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010),
2098 .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020),
2099 .num_counters = 3,
2100 .norm = { 1, 1 },
2101 };
2102
2103 static const struct nvc0_hw_sm_query_cfg
2104 sm21_inst_issued1_0 =
2105 {
2106 .type = NVC0_HW_SM_QUERY_INST_ISSUED1_0,
2107 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010),
2108 .num_counters = 1,
2109 .norm = { 1, 1 },
2110 };
2111
2112 static const struct nvc0_hw_sm_query_cfg
2113 sm21_inst_issued1_1 =
2114 {
2115 .type = NVC0_HW_SM_QUERY_INST_ISSUED1_1,
2116 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040),
2117 .num_counters = 1,
2118 .norm = { 1, 1 },
2119 };
2120
2121 static const struct nvc0_hw_sm_query_cfg
2122 sm21_inst_issued2_0 =
2123 {
2124 .type = NVC0_HW_SM_QUERY_INST_ISSUED2_0,
2125 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020),
2126 .num_counters = 1,
2127 .norm = { 1, 1 },
2128 };
2129
2130 static const struct nvc0_hw_sm_query_cfg
2131 sm21_inst_issued2_1 =
2132 {
2133 .type = NVC0_HW_SM_QUERY_INST_ISSUED2_1,
2134 .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050),
2135 .num_counters = 1,
2136 .norm = { 1, 1 },
2137 };
2138
2139 static const struct nvc0_hw_sm_query_cfg
2140 sm21_th_inst_executed_0 =
2141 {
2142 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
2143 .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000),
2144 .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010),
2145 .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020),
2146 .ctr[3] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000030),
2147 .ctr[4] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000040),
2148 .ctr[5] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000050),
2149 .num_counters = 6,
2150 .norm = { 1, 1 },
2151 };
2152
2153 static const struct nvc0_hw_sm_query_cfg
2154 sm21_th_inst_executed_1 =
2155 {
2156 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
2157 .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000),
2158 .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010),
2159 .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020),
2160 .ctr[3] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000030),
2161 .ctr[4] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000040),
2162 .ctr[5] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000050),
2163 .num_counters = 6,
2164 .norm = { 1, 1 },
2165 };
2166
2167 static const struct nvc0_hw_sm_query_cfg
2168 sm21_th_inst_executed_2 =
2169 {
2170 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
2171 .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000),
2172 .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010),
2173 .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020),
2174 .ctr[3] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000030),
2175 .ctr[4] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000040),
2176 .ctr[5] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000050),
2177 .num_counters = 6,
2178 .norm = { 1, 1 },
2179 };
2180
2181 static const struct nvc0_hw_sm_query_cfg
2182 sm21_th_inst_executed_3 =
2183 {
2184 .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
2185 .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000),
2186 .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010),
2187 .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020),
2188 .ctr[3] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000030),
2189 .ctr[4] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000040),
2190 .ctr[5] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000050),
2191 .num_counters = 6,
2192 .norm = { 1, 1 },
2193 };
2194
2195 static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] =
2196 {
2197 &sm20_active_cycles,
2198 &sm20_active_warps,
2199 &sm20_atom_count,
2200 &sm20_branch,
2201 &sm20_divergent_branch,
2202 &sm20_gld_request,
2203 &sm20_gred_count,
2204 &sm20_gst_request,
2205 &sm21_inst_executed,
2206 &sm21_inst_issued1_0,
2207 &sm21_inst_issued1_1,
2208 &sm21_inst_issued2_0,
2209 &sm21_inst_issued2_1,
2210 &sm20_local_ld,
2211 &sm20_local_st,
2212 &sm20_prof_trigger_0,
2213 &sm20_prof_trigger_1,
2214 &sm20_prof_trigger_2,
2215 &sm20_prof_trigger_3,
2216 &sm20_prof_trigger_4,
2217 &sm20_prof_trigger_5,
2218 &sm20_prof_trigger_6,
2219 &sm20_prof_trigger_7,
2220 &sm20_shared_ld,
2221 &sm20_shared_st,
2222 &sm20_threads_launched,
2223 &sm21_th_inst_executed_0,
2224 &sm21_th_inst_executed_1,
2225 &sm21_th_inst_executed_2,
2226 &sm21_th_inst_executed_3,
2227 &sm20_warps_launched,
2228 };
2229
2230 #undef _C
2231
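/* Pick the query table matching the chipset: 0xc0 (GF100) and 0xc8 (GF110)
 * use the compute capability 2.0 tables, every other Fermi chip uses the 2.1
 * tables, and the Kepler/Maxwell classes get the tables defined above. */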
2232 static inline const struct nvc0_hw_sm_query_cfg **
2233 nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
2234 {
2235 struct nouveau_device *dev = screen->base.device;
2236
2237 switch (screen->base.class_3d) {
2238 case GM200_3D_CLASS:
2239 return sm52_hw_sm_queries;
2240 case GM107_3D_CLASS:
2241 return sm50_hw_sm_queries;
2242 case NVF0_3D_CLASS:
2243 return sm35_hw_sm_queries;
2244 case NVE4_3D_CLASS:
2245 return sm30_hw_sm_queries;
2246 case NVC0_3D_CLASS:
2247 case NVC1_3D_CLASS:
2248 case NVC8_3D_CLASS:
2249 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
2250 return sm20_hw_sm_queries;
2251 return sm21_hw_sm_queries;
2252 }
2253 assert(0);
2254 return NULL;
2255 }
2256
2257 unsigned
2258 nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)
2259 {
2260 struct nouveau_device *dev = screen->base.device;
2261
2262 switch (screen->base.class_3d) {
2263 case GM200_3D_CLASS:
2264 return ARRAY_SIZE(sm52_hw_sm_queries);
2265 case GM107_3D_CLASS:
2266 return ARRAY_SIZE(sm50_hw_sm_queries);
2267 case NVF0_3D_CLASS:
2268 return ARRAY_SIZE(sm35_hw_sm_queries);
2269 case NVE4_3D_CLASS:
2270 return ARRAY_SIZE(sm30_hw_sm_queries);
2271 case NVC0_3D_CLASS:
2272 case NVC1_3D_CLASS:
2273 case NVC8_3D_CLASS:
2274 if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
2275 return ARRAY_SIZE(sm20_hw_sm_queries);
2276 return ARRAY_SIZE(sm21_hw_sm_queries);
2277 }
2278 return 0;
2279 }
2280
2281 static const struct nvc0_hw_sm_query_cfg *
2282 nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2283 {
2284 const struct nvc0_hw_sm_query_cfg **queries;
2285 struct nvc0_screen *screen = nvc0->screen;
2286 struct nvc0_query *q = &hq->base;
2287 unsigned num_queries;
2288 unsigned i;
2289
2290 num_queries = nvc0_hw_sm_get_num_queries(screen);
2291 queries = nvc0_hw_sm_get_queries(screen);
2292
2293 for (i = 0; i < num_queries; i++) {
2294 if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type)
2295 return queries[i];
2296 }
2297 assert(0);
2298 return NULL;
2299 }
2300
2301 static void
2302 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2303 {
2304 struct nvc0_query *q = &hq->base;
2305 nvc0_hw_query_allocate(nvc0, q, 0);
2306 nouveau_fence_ref(NULL, &hq->fence);
2307 FREE(hq);
2308 }
2309
2310 static bool
2311 nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2312 {
2313 struct nvc0_screen *screen = nvc0->screen;
2314 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
2315 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2316 const struct nvc0_hw_sm_query_cfg *cfg;
2317 unsigned i, c;
2318 unsigned num_ab[2] = { 0, 0 };
2319
2320 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
2321
2322 /* check if we have enough free counter slots */
2323 for (i = 0; i < cfg->num_counters; ++i)
2324 num_ab[cfg->ctr[i].sig_dom]++;
2325
2326 if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
2327 screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
2328 NOUVEAU_ERR("Not enough free MP counter slots !\n");
2329 return false;
2330 }
2331
2332 assert(cfg->num_counters <= 4);
2333 PUSH_SPACE(push, 4 * 8 * + 6);
2334
2335 if (!screen->pm.mp_counters_enabled) {
2336 screen->pm.mp_counters_enabled = true;
2337 BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
2338 PUSH_DATA (push, 0x1fcb);
2339 }
2340
2341 /* set sequence field to 0 (used to check if result is available) */
2342 for (i = 0; i < screen->mp_count; ++i)
2343 hq->data[i * 10 + 10] = 0;
2344 hq->sequence++;
2345
2346 for (i = 0; i < cfg->num_counters; ++i) {
2347 const unsigned d = cfg->ctr[i].sig_dom;
2348
2349 if (!screen->pm.num_hw_sm_active[d]) {
2350 uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
2351 if (screen->pm.num_hw_sm_active[!d])
2352 m |= 1 << (7 + (8 * d));
2353 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
2354 PUSH_DATA (push, m);
2355 }
2356 screen->pm.num_hw_sm_active[d]++;
2357
2358 for (c = d * 4; c < (d * 4 + 4); ++c) {
2359 if (!screen->pm.mp_counter[c]) {
2360 hsq->ctr[i] = c;
2361 screen->pm.mp_counter[c] = hsq;
2362 break;
2363 }
2364 }
2365 assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
2366
2367 /* configure and reset the counter(s) */
2368 if (d == 0)
2369 BEGIN_NVC0(push, NVE4_CP(MP_PM_A_SIGSEL(c & 3)), 1);
2370 else
2371 BEGIN_NVC0(push, NVE4_CP(MP_PM_B_SIGSEL(c & 3)), 1);
2372 PUSH_DATA (push, cfg->ctr[i].sig_sel);
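      /* 0x2108421 has one bit set in every 5-bit group, so multiplying it by
       * (c & 3) presumably adds the slot index to each 5-bit source-select
       * field, mirroring the per-slot signal id adjustment done explicitly in
       * the Fermi path below. */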
2373 BEGIN_NVC0(push, NVE4_CP(MP_PM_SRCSEL(c)), 1);
2374 PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
2375 BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 1);
2376 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
2377 BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);
2378 PUSH_DATA (push, 0);
2379 }
2380
2381 if (screen->base.class_3d >= GM107_3D_CLASS) {
2382       /* Enable mask for the counters: an 8-bit value where bits 0:3 are for
2383        * domain A and bits 4:7 for domain B. For example, the mask for
2384        * active_warps would be 0x70 because it uses 3 counters in domain B.
2385        * However, always enable all counters because we don't want to track
2386        * which ones are enabled, and this allows monitoring multiple queries
2387        * at the same time. */
2388 BEGIN_NVC0(push, SUBC_CP(0x33e0), 1);
2389 PUSH_DATA (push, 0xff);
2390 }
2391
2392 return true;
2393 }
2394
2395 static bool
2396 nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2397 {
2398 struct nvc0_screen *screen = nvc0->screen;
2399 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
2400 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2401 const struct nvc0_hw_sm_query_cfg *cfg;
2402 unsigned i, c;
2403
2404 if (screen->base.class_3d >= NVE4_3D_CLASS)
2405 return nve4_hw_sm_begin_query(nvc0, hq);
2406
2407 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
2408
2409 /* check if we have enough free counter slots */
2410 if (screen->pm.num_hw_sm_active[0] + cfg->num_counters > 8) {
2411 NOUVEAU_ERR("Not enough free MP counter slots !\n");
2412 return false;
2413 }
2414
2415 assert(cfg->num_counters <= 8);
2416 PUSH_SPACE(push, 8 * 8 + 2);
2417
2418 /* set sequence field to 0 (used to check if result is available) */
2419 for (i = 0; i < screen->mp_count; ++i) {
2420 const unsigned b = (0x30 / 4) * i;
2421 hq->data[b + 8] = 0;
2422 }
2423 hq->sequence++;
2424
2425 for (i = 0; i < cfg->num_counters; ++i) {
2426 uint32_t mask_sel = 0x00000000;
2427
2428 if (!screen->pm.num_hw_sm_active[0]) {
2429 BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
2430 PUSH_DATA (push, 0x80000000);
2431 }
2432 screen->pm.num_hw_sm_active[0]++;
2433
2434 for (c = 0; c < 8; ++c) {
2435 if (!screen->pm.mp_counter[c]) {
2436 hsq->ctr[i] = c;
2437 screen->pm.mp_counter[c] = hsq;
2438 break;
2439 }
2440 }
2441
2442       /* Oddly enough, the signal id depends on the selected slot on Fermi but
2443        * not on Kepler. Fortunately, the signal ids are just offset by the
2444        * slot id! */
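      /* For example, with slot c = 2 the value built below is 0x02020202,
       * which then gets masked down to the fields covered by src_mask. */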
2445 mask_sel |= c;
2446 mask_sel |= (c << 8);
2447 mask_sel |= (c << 16);
2448 mask_sel |= (c << 24);
2449 mask_sel &= cfg->ctr[i].src_mask;
2450
2451 /* configure and reset the counter(s) */
2452 BEGIN_NVC0(push, NVC0_CP(MP_PM_SIGSEL(c)), 1);
2453 PUSH_DATA (push, cfg->ctr[i].sig_sel);
2454 BEGIN_NVC0(push, NVC0_CP(MP_PM_SRCSEL(c)), 1);
2455 PUSH_DATA (push, cfg->ctr[i].src_sel | mask_sel);
2456 BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(c)), 1);
2457 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
2458 BEGIN_NVC0(push, NVC0_CP(MP_PM_SET(c)), 1);
2459 PUSH_DATA (push, 0);
2460 }
2461 return true;
2462 }
2463
2464 static inline struct nvc0_program *
2465 nvc0_hw_sm_get_program(struct nvc0_screen *screen)
2466 {
2467 struct nvc0_program *prog;
2468
2469 prog = CALLOC_STRUCT(nvc0_program);
2470 if (!prog)
2471 return NULL;
2472
2473 prog->type = PIPE_SHADER_COMPUTE;
2474 prog->translated = true;
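   /* 12 bytes of input: the query buffer address (lo/hi) plus the sequence
    * number, as pushed by nvc0_hw_sm_upload_input(). */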
2475 prog->parm_size = 12;
2476
2477 if (screen->base.class_3d >= GM107_3D_CLASS) {
2478 prog->code = (uint32_t *)gm107_read_hw_sm_counters_code;
2479 prog->code_size = sizeof(gm107_read_hw_sm_counters_code);
2480 prog->num_gprs = 14;
2481 } else
2482 if (screen->base.class_3d == NVE4_3D_CLASS ||
2483 screen->base.class_3d == NVF0_3D_CLASS) {
2484 if (screen->base.class_3d == NVE4_3D_CLASS) {
2485 prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
2486 prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
2487 } else {
2488 prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code;
2489 prog->code_size = sizeof(nvf0_read_hw_sm_counters_code);
2490 }
2491 prog->num_gprs = 14;
2492 } else {
2493 prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
2494 prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
2495 prog->num_gprs = 12;
2496 }
2497 return prog;
2498 }
2499
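/* Write the query buffer address and the current sequence number into the
 * compute driver constant buffer, where the readback kernels pick them up
 * (the Fermi kernel reads c15[0x6a0..0x6a8], which is presumably where
 * NVC0_CB_AUX_MP_INFO ends up for the compute stage). */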
2500 static inline void
2501 nvc0_hw_sm_upload_input(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2502 {
2503 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
2504 struct nvc0_screen *screen = nvc0->screen;
2505 uint64_t address;
2506 const int s = 5;
2507
2508 address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
2509
2510 PUSH_SPACE(push, 11);
2511
2512 if (screen->base.class_3d >= NVE4_3D_CLASS) {
2513 BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
2514 PUSH_DATAh(push, address + NVC0_CB_AUX_MP_INFO);
2515 PUSH_DATA (push, address + NVC0_CB_AUX_MP_INFO);
2516 BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
2517 PUSH_DATA (push, 3 * 4);
2518 PUSH_DATA (push, 0x1);
2519 BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 3);
2520 PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
2521 } else {
2522 BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
2523 PUSH_DATA (push, NVC0_CB_AUX_SIZE);
2524 PUSH_DATAh(push, address);
2525 PUSH_DATA (push, address);
2526 BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 3);
2527 PUSH_DATA (push, NVC0_CB_AUX_MP_INFO);
2528 }
2529 PUSH_DATA (push, (hq->bo->offset + hq->base_offset));
2530 PUSH_DATAh(push, (hq->bo->offset + hq->base_offset));
2531 PUSH_DATA (push, hq->sequence);
2532 }
2533
2534 static void
2535 nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
2536 {
2537 struct nvc0_screen *screen = nvc0->screen;
2538 struct pipe_context *pipe = &nvc0->base.pipe;
2539 struct nouveau_pushbuf *push = nvc0->base.pushbuf;
2540 const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
2541 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2542 struct nvc0_program *old = nvc0->compprog;
2543 struct pipe_grid_info info = {};
2544 uint32_t mask;
2545 uint32_t input[3];
2546 const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
2547 const uint grid[3] = { screen->mp_count, screen->gpc_count, 1 };
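   /* Launch mp_count x gpc_count blocks, presumably to guarantee that at
    * least one block lands on every MP; the kernel derives its output slot
    * from $physid rather than from the block id. On Kepler and later the
    * block is 32x4, which matches the four scheduler partitions (WS0..WS3)
    * in the layout described in nvc0_hw_sm_create_query(). */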
2548 unsigned c, i;
2549
2550 if (unlikely(!screen->pm.prog))
2551 screen->pm.prog = nvc0_hw_sm_get_program(screen);
2552
2553 /* disable all counting */
2554 PUSH_SPACE(push, 8);
2555 for (c = 0; c < 8; ++c)
2556 if (screen->pm.mp_counter[c]) {
2557 if (is_nve4) {
2558 IMMED_NVC0(push, NVE4_CP(MP_PM_FUNC(c)), 0);
2559 } else {
2560 IMMED_NVC0(push, NVC0_CP(MP_PM_OP(c)), 0);
2561 }
2562 }
2563 /* release counters for this query */
2564 for (c = 0; c < 8; ++c) {
2565 if (screen->pm.mp_counter[c] == hsq) {
2566 uint8_t d = is_nve4 ? c / 4 : 0; /* only one domain for NVC0:NVE4 */
2567 screen->pm.num_hw_sm_active[d]--;
2568 screen->pm.mp_counter[c] = NULL;
2569 }
2570 }
2571
2572 if (screen->base.class_3d >= GM107_3D_CLASS)
2573 IMMED_NVC0(push, SUBC_CP(0x33e0), 0);
2574
2575 BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
2576 hq->bo);
2577
2578 PUSH_SPACE(push, 1);
2579 IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
2580
2581 /* upload input data for the compute shader which reads MP counters */
2582 nvc0_hw_sm_upload_input(nvc0, hq);
2583
2584 pipe->bind_compute_state(pipe, screen->pm.prog);
2585 for (i = 0; i < 3; i++) {
2586 info.block[i] = block[i];
2587 info.grid[i] = grid[i];
2588 }
2589 info.pc = 0;
2590 info.input = input;
2591 pipe->launch_grid(pipe, &info);
2592 pipe->bind_compute_state(pipe, old);
2593
2594 nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
2595
2596 /* re-activate other counters */
2597 PUSH_SPACE(push, 16);
2598 mask = 0;
2599 for (c = 0; c < 8; ++c) {
2600 const struct nvc0_hw_sm_query_cfg *cfg;
2601 unsigned i;
2602
2603 hsq = screen->pm.mp_counter[c];
2604 if (!hsq)
2605 continue;
2606
2607 cfg = nvc0_hw_sm_query_get_cfg(nvc0, &hsq->base);
2608 for (i = 0; i < cfg->num_counters; ++i) {
2609 if (mask & (1 << hsq->ctr[i]))
2610 break;
2611 mask |= 1 << hsq->ctr[i];
2612 if (is_nve4) {
2613 BEGIN_NVC0(push, NVE4_CP(MP_PM_FUNC(hsq->ctr[i])), 1);
2614 } else {
2615 BEGIN_NVC0(push, NVC0_CP(MP_PM_OP(hsq->ctr[i])), 1);
2616 }
2617 PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
2618 }
2619 }
2620 }
2621
2622 static inline bool
2623 nvc0_hw_sm_query_read_data(uint32_t count[32][8],
2624 struct nvc0_context *nvc0, bool wait,
2625 struct nvc0_hw_query *hq,
2626 const struct nvc0_hw_sm_query_cfg *cfg,
2627 unsigned mp_count)
2628 {
2629 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2630 unsigned p, c;
2631
2632 for (p = 0; p < mp_count; ++p) {
2633 const unsigned b = (0x30 / 4) * p;
2634
2635 for (c = 0; c < cfg->num_counters; ++c) {
2636 if (hq->data[b + 8] != hq->sequence) {
2637 if (!wait)
2638 return false;
2639 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
2640 return false;
2641 }
2642 count[p][c] = hq->data[b + hsq->ctr[c]] * (1 << c);
2643 }
2644 }
2645 return true;
2646 }
2647
2648 static inline bool
2649 nve4_hw_sm_query_read_data(uint32_t count[32][8],
2650 struct nvc0_context *nvc0, bool wait,
2651 struct nvc0_hw_query *hq,
2652 const struct nvc0_hw_sm_query_cfg *cfg,
2653 unsigned mp_count)
2654 {
2655 struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
2656 unsigned p, c, d;
2657
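   /* Per the layout set up in nvc0_hw_sm_create_query(): counter slots 0..3
    * are replicated per warp scheduler (four partial values to accumulate),
    * while slots 4..7 are single per-MP values stored after the scheduler
    * sections. */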
2658 for (p = 0; p < mp_count; ++p) {
2659 const unsigned b = (0x60 / 4) * p;
2660
2661 for (c = 0; c < cfg->num_counters; ++c) {
2662 count[p][c] = 0;
2663 for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
2664 if (hq->data[b + 20 + d] != hq->sequence) {
2665 if (!wait)
2666 return false;
2667 if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nvc0->base.client))
2668 return false;
2669 }
2670 if (hsq->ctr[c] & ~0x3)
2671 count[p][c] = hq->data[b + 16 + (hsq->ctr[c] & 3)];
2672 else
2673 count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
2674 }
2675 }
2676 }
2677 return true;
2678 }
2679
2680 static bool
2681 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
2682 bool wait, union pipe_query_result *result)
2683 {
2684 uint32_t count[32][8];
2685 uint64_t value = 0;
2686 unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
2687 unsigned p, c;
2688 const struct nvc0_hw_sm_query_cfg *cfg;
2689 bool ret;
2690
2691 cfg = nvc0_hw_sm_query_get_cfg(nvc0, hq);
2692
2693 if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
2694 ret = nve4_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
2695 else
2696 ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, hq, cfg, mp_count);
2697 if (!ret)
2698 return false;
2699
2700 for (c = 0; c < cfg->num_counters; ++c)
2701 for (p = 0; p < mp_count; ++p)
2702 value += count[p][c];
2703 value = (value * cfg->norm[0]) / cfg->norm[1];
2704
2705 *(uint64_t *)result = value;
2706 return true;
2707 }
2708
2709 static const struct nvc0_hw_query_funcs hw_sm_query_funcs = {
2710 .destroy_query = nvc0_hw_sm_destroy_query,
2711 .begin_query = nvc0_hw_sm_begin_query,
2712 .end_query = nvc0_hw_sm_end_query,
2713 .get_query_result = nvc0_hw_sm_get_query_result,
2714 };
2715
2716 struct nvc0_hw_query *
2717 nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
2718 {
2719 struct nvc0_screen *screen = nvc0->screen;
2720 struct nvc0_hw_sm_query *hsq;
2721 struct nvc0_hw_query *hq;
2722 unsigned space;
2723
2724 if (nvc0->screen->base.drm->version < 0x01000101)
2725 return NULL;
2726
2727 if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)
2728 return NULL;
2729
2730 hsq = CALLOC_STRUCT(nvc0_hw_sm_query);
2731 if (!hsq)
2732 return NULL;
2733
2734 hq = &hsq->base;
2735 hq->funcs = &hw_sm_query_funcs;
2736 hq->base.type = type;
2737
2738 if (screen->base.class_3d >= NVE4_3D_CLASS) {
2739 /* for each MP:
2740 * [00] = WS0.C0
2741 * [04] = WS0.C1
2742 * [08] = WS0.C2
2743 * [0c] = WS0.C3
2744 * [10] = WS1.C0
2745 * [14] = WS1.C1
2746 * [18] = WS1.C2
2747 * [1c] = WS1.C3
2748 * [20] = WS2.C0
2749 * [24] = WS2.C1
2750 * [28] = WS2.C2
2751 * [2c] = WS2.C3
2752 * [30] = WS3.C0
2753 * [34] = WS3.C1
2754 * [38] = WS3.C2
2755 * [3c] = WS3.C3
2756 * [40] = MP.C4
2757 * [44] = MP.C5
2758 * [48] = MP.C6
2759 * [4c] = MP.C7
2760 * [50] = WS0.sequence
2761 * [54] = WS1.sequence
2762 * [58] = WS2.sequence
2763 * [5c] = WS3.sequence
2764 */
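      /* 24 words = 0x60 bytes per MP, matching the (0x60 / 4) stride used by
       * nve4_hw_sm_query_read_data(). */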
2765 space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
2766 } else {
2767 /*
2768 * Note that padding is used to align memory access to 128 bits.
2769 *
2770 * for each MP:
2771 * [00] = MP.C0
2772 * [04] = MP.C1
2773 * [08] = MP.C2
2774 * [0c] = MP.C3
2775 * [10] = MP.C4
2776 * [14] = MP.C5
2777 * [18] = MP.C6
2778 * [1c] = MP.C7
2779 * [20] = MP.sequence
2780 * [24] = padding
2781 * [28] = padding
2782 * [2c] = padding
2783 */
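      /* 12 words = 0x30 bytes per MP, matching the (0x30 / 4) stride used by
       * nvc0_hw_sm_query_read_data(). */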
2784 space = (8 + 1 + 3) * nvc0->screen->mp_count * sizeof(uint32_t);
2785 }
2786
2787 if (!nvc0_hw_query_allocate(nvc0, &hq->base, space)) {
2788 FREE(hq);
2789 return NULL;
2790 }
2791
2792 return hq;
2793 }
2794
2795 int
2796 nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
2797 struct pipe_driver_query_info *info)
2798 {
2799 int count = 0;
2800
2801 if (screen->base.drm->version >= 0x01000101) {
2802 if (screen->compute)
2803 count = nvc0_hw_sm_get_num_queries(screen);
2804 }
2805
2806 if (!info)
2807 return count;
2808
2809 if (id < count) {
2810 if (screen->compute) {
2811 if (screen->base.class_3d <= GM200_3D_CLASS) {
2812 const struct nvc0_hw_sm_query_cfg **queries =
2813 nvc0_hw_sm_get_queries(screen);
2814
2815 info->name = nvc0_hw_sm_query_get_name(queries[id]->type);
2816 info->query_type = NVC0_HW_SM_QUERY(queries[id]->type);
2817 info->group_id = NVC0_HW_SM_QUERY_GROUP;
2818 return 1;
2819 }
2820 }
2821 }
2822 return 0;
2823 }
2824