1 /*
2 * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
3 * Copyright © 2018 Google, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors:
25 * Rob Clark <robclark@freedesktop.org>
26 */
27
28 #define FD_BO_NO_HARDPIN 1
29
30 /* NOTE: see https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A5xx-Queries */
31
32 #include "freedreno_query_acc.h"
33 #include "freedreno_resource.h"
34
35 #include "fd6_context.h"
36 #include "fd6_emit.h"
37 #include "fd6_query.h"
38
/* g++ is picky about offsets that cannot be resolved at compile time, so
 * roll our own __offsetof() as a statement expression: compute the byte
 * delta from a zero-initialized temporary instead of using offsetof().
 */
#define __offsetof(type, field) \
   ({ type _x = {}; ((uint8_t *)&_x.field) - ((uint8_t *)&_x);})
44
/* GPU-written sample layout for occlusion/timestamp/perfcntr queries.
 * PACKED so the field offsets are exactly what the command stream expects.
 */
struct PACKED fd6_query_sample {
   struct fd_acc_query_sample base;

   /* The RB_SAMPLE_COUNT_ADDR destination needs to be 16-byte aligned: */
   uint64_t pad;

   uint64_t start;   /* counter snapshot taken at query resume */
   uint64_t result;  /* accumulated (stop - start) across suspend/resume */
   uint64_t stop;    /* counter snapshot taken at query pause */
};
DEFINE_CAST(fd_acc_query_sample, fd6_query_sample);
56
/* offset of a single field of an array of fd6_query_sample.
 *
 * Expands to the (bo, offset, 0, 0) argument list expected by OUT_RELOC().
 * Note: `idx` is parenthesized so that expression arguments (e.g. `i + 1`)
 * multiply correctly against sizeof() instead of binding only their last
 * term.
 */
#define query_sample_idx(aq, idx, field)                                     \
   fd_resource((aq)->prsc)->bo,                                              \
      ((idx) * sizeof(struct fd6_query_sample)) +                            \
         offsetof(struct fd6_query_sample, field),                           \
      0, 0

/* offset of a single field of fd6_query_sample: */
#define query_sample(aq, field) query_sample_idx(aq, 0, field)
66
67 /*
68 * Occlusion Query:
69 *
70 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
71 * interpret results
72 */
73
static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   /* hw requires the sample-count destination to be 16-byte aligned: */
   ASSERT_ALIGNED(struct fd6_query_sample, start, 16);

   /* Arrange for RB to copy the live sample counter to memory: */
   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
   OUT_RELOC(ring, query_sample(aq, start));

   /* ZPASS_DONE triggers the actual counter write: */
   fd6_event_write(batch, ring, ZPASS_DONE, false);
}
89
static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   /* Seed `stop` with a marker value that the real sample count can never
    * be, so the epilogue can poll for the ZPASS_DONE write to land:
    */
   OUT_PKT7(ring, CP_MEM_WRITE, 4);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0xffffffff);

   OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   ASSERT_ALIGNED(struct fd6_query_sample, stop, 16);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
   OUT_RELOC(ring, query_sample(aq, stop));

   fd6_event_write(batch, ring, ZPASS_DONE, false);

   /* To avoid stalling in the draw buffer, emit the code to compute the
    * counter delta in the epilogue ring.
    */
   struct fd_ringbuffer *epilogue = fd_batch_get_tile_epilogue(batch);

   /* Wait until ZPASS_DONE has overwritten the 0xffffffff marker: */
   OUT_PKT7(epilogue, CP_WAIT_REG_MEM, 6);
   OUT_RING(epilogue, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                      CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
   OUT_RELOC(epilogue, query_sample(aq, stop));
   OUT_RING(epilogue, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   OUT_RING(epilogue, CP_WAIT_REG_MEM_4_MASK(0xffffffff));
   OUT_RING(epilogue, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result += stop - start: */
   OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9);
   OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */
   OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */
   OUT_RELOC(epilogue, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(epilogue, query_sample(aq, start));  /* srcC */
}
133
134 static void
occlusion_counter_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)135 occlusion_counter_result(struct fd_acc_query *aq,
136 struct fd_acc_query_sample *s,
137 union pipe_query_result *result)
138 {
139 struct fd6_query_sample *sp = fd6_query_sample(s);
140 result->u64 = sp->result;
141 }
142
/* GPU-side copy of the occlusion counter into a caller-provided buffer
 * (query-buffer-object path), without a CPU stall.
 */
static void
occlusion_counter_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                                  enum pipe_query_value_type result_type,
                                  int index, struct fd_resource *dst,
                                  unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}
152
153 static void
occlusion_predicate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)154 occlusion_predicate_result(struct fd_acc_query *aq,
155 struct fd_acc_query_sample *s,
156 union pipe_query_result *result)
157 {
158 struct fd6_query_sample *sp = fd6_query_sample(s);
159 result->b = !!sp->result;
160 }
161
162 static void
occlusion_predicate_result_resource(struct fd_acc_query * aq,struct fd_ringbuffer * ring,enum pipe_query_value_type result_type,int index,struct fd_resource * dst,unsigned offset)163 occlusion_predicate_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
164 enum pipe_query_value_type result_type,
165 int index, struct fd_resource *dst,
166 unsigned offset)
167 {
168 /* This is a bit annoying but we need to turn the result into a one or
169 * zero.. to do this use a CP_COND_WRITE to overwrite the result with
170 * a one if it is non-zero. This doesn't change the results if the
171 * query is also read on the CPU (ie. occlusion_predicate_result()).
172 */
173 OUT_PKT7(ring, CP_COND_WRITE5, 9);
174 OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) |
175 CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY) |
176 CP_COND_WRITE5_0_WRITE_MEMORY);
177 OUT_RELOC(ring, query_sample(aq, result)); /* POLL_ADDR_LO/HI */
178 OUT_RING(ring, CP_COND_WRITE5_3_REF(0));
179 OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
180 OUT_RELOC(ring, query_sample(aq, result)); /* WRITE_ADDR_LO/HI */
181 OUT_RING(ring, 1);
182 OUT_RING(ring, 0);
183
184 copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
185 offsetof(struct fd6_query_sample, result));
186 }
187
/* All three occlusion flavors share the same hw counter plumbing and only
 * differ in how the accumulated result is interpreted on read-back.
 */
static const struct fd_acc_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_counter_result,
   .result_resource = occlusion_counter_result_resource,
};

static const struct fd_acc_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
   .result_resource = occlusion_predicate_result_resource,
};

/* Conservative predicate uses the exact counter too (no cheaper hw path): */
static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
   .result_resource = occlusion_predicate_result_resource,
};
214
215 /*
216 * Timestamp Queries:
217 */
218
/* Capture a GPU timestamp into the `start` slot via a timestamped
 * RB_DONE_TS event.  Used by both TIMESTAMP and TIME_ELAPSED queries.
 */
static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, start));
   OUT_RING(ring, 0x00000000);
}
230
/* Capture the end timestamp and fold (stop - start) into the accumulated
 * result on the GPU.
 */
static void
time_elapsed_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0x00000000);

   /* ensure the timestamp write has landed before CP reads it back: */
   OUT_WFI5(ring);

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */
}
252
/* TIMESTAMP queries only need a single snapshot (taken at resume). */
static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
   /* We captured a timestamp in timestamp_resume(), nothing to do here. */
}
258
/* timestamp logging for u_trace: writes a timestamped RB_DONE_TS event to
 * an arbitrary bo/offset (u_trace owns the destination, not a query).
 */
static void
record_timestamp(struct fd_ringbuffer *ring, struct fd_bo *bo, unsigned offset)
{
   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, bo, offset, 0, 0);
   OUT_RING(ring, 0x00000000);
}
269
270 static void
time_elapsed_accumulate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)271 time_elapsed_accumulate_result(struct fd_acc_query *aq,
272 struct fd_acc_query_sample *s,
273 union pipe_query_result *result)
274 {
275 struct fd6_query_sample *sp = fd6_query_sample(s);
276 result->u64 = ticks_to_ns(sp->result);
277 }
278
/* GPU-side copy of the elapsed-ticks result to a buffer.  Note the value
 * is raw ticks, unlike the CPU path which converts to ns.
 */
static void
time_elapsed_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                             enum pipe_query_value_type result_type,
                             int index, struct fd_resource *dst,
                             unsigned offset)
{
   // TODO ticks_to_ns conversion would require spinning up a compute shader?
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}
289
290 static void
timestamp_accumulate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)291 timestamp_accumulate_result(struct fd_acc_query *aq,
292 struct fd_acc_query_sample *s,
293 union pipe_query_result *result)
294 {
295 struct fd6_query_sample *sp = fd6_query_sample(s);
296 result->u64 = ticks_to_ns(sp->start);
297 }
298
/* GPU-side copy of the timestamp snapshot (raw ticks, from `start`). */
static void
timestamp_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                          enum pipe_query_value_type result_type,
                          int index, struct fd_resource *dst,
                          unsigned offset)
{
   // TODO ticks_to_ns conversion would require spinning up a compute shader?
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, start));
}
309
static const struct fd_acc_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true, /* timestamps must be captured even with no draws */
   .size = sizeof(struct fd6_query_sample),
   .resume = timestamp_resume,
   .pause = time_elapsed_pause,
   .result = time_elapsed_accumulate_result,
   .result_resource = time_elapsed_result_resource,
};
319
/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler. But it is needed by qapitrace profile heatmap. If you
 * add in a binning pass, the results get even more non-sensical. So
 * we just return the timestamp on the last tile and hope that is
 * kind of good enough.
 */

static const struct fd_acc_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .size = sizeof(struct fd6_query_sample),
   .resume = timestamp_resume,
   .pause = timestamp_pause,
   .result = timestamp_accumulate_result,
   .result_resource = timestamp_result_resource,
};
336
/* Sample layout for pipeline-statistics queries: one 64b RBBM_PRIMCTR
 * snapshot pair plus the accumulated delta.
 */
struct PACKED fd6_pipeline_stats_sample {
   struct fd_acc_query_sample base;

   uint64_t start, stop, result;
};
DEFINE_CAST(fd_acc_query_sample, fd6_pipeline_stats_sample);
343
/* Emit a reloc to a field of the query's fd6_pipeline_stats_sample.
 * NOTE: the trailing ';' is part of the macro expansion, so call sites
 * work with or without their own semicolon.
 */
#define stats_reloc(ring, aq, field) \
   OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \
             __offsetof(struct fd6_pipeline_stats_sample, field), 0, 0);
347
348 /* Mapping of counters to pipeline stats:
349 *
350 * Gallium (PIPE_STAT_QUERY_x) | Vulkan (VK_QUERY_PIPELINE_STATISTIC_x_BIT) | hw counter
351 * ----------------------------+--------------------------------------------+----------------
352 * IA_VERTICES | INPUT_ASSEMBLY_VERTICES | RBBM_PRIMCTR_0
353 * IA_PRIMITIVES | INPUT_ASSEMBLY_PRIMITIVES | RBBM_PRIMCTR_1
354 * VS_INVOCATIONS | VERTEX_SHADER_INVOCATIONS | RBBM_PRIMCTR_0
355 * GS_INVOCATIONS | GEOMETRY_SHADER_INVOCATIONS | RBBM_PRIMCTR_5
356 * GS_PRIMITIVES | GEOMETRY_SHADER_PRIMITIVES | RBBM_PRIMCTR_6
357 * C_INVOCATIONS | CLIPPING_INVOCATIONS | RBBM_PRIMCTR_7
358 * C_PRIMITIVES | CLIPPING_PRIMITIVES | RBBM_PRIMCTR_8
359 * PS_INVOCATIONS | FRAGMENT_SHADER_INVOCATIONS | RBBM_PRIMCTR_9
360 * HS_INVOCATIONS | TESSELLATION_CONTROL_SHADER_PATCHES | RBBM_PRIMCTR_2
361 * DS_INVOCATIONS | TESSELLATION_EVALUATION_SHADER_INVOCATIONS | RBBM_PRIMCTR_4
362 * CS_INVOCATIONS | COMPUTE_SHADER_INVOCATIONS | RBBM_PRIMCTR_10
363 *
364 * Note that "Vertices corresponding to incomplete primitives may contribute to the count.",
365 * in our case they do not, so IA_VERTICES and VS_INVOCATIONS are the same thing.
366 */
367
/* The hw groups pipeline-stat counters into three independently
 * started/stopped sets (see stats_counter_events[]):
 */
enum stats_type {
   STATS_PRIMITIVE,
   STATS_FRAGMENT,
   STATS_COMPUTE,
};
373
/* Start/stop VGT events for each counter set, indexed by enum stats_type: */
static const struct {
   enum vgt_event_type start, stop;
} stats_counter_events[] = {
   [STATS_PRIMITIVE] = { START_PRIMITIVE_CTRS, STOP_PRIMITIVE_CTRS },
   [STATS_FRAGMENT] = { START_FRAGMENT_CTRS, STOP_FRAGMENT_CTRS },
   [STATS_COMPUTE] = { START_COMPUTE_CTRS, STOP_COMPUTE_CTRS },
};
381
382 static enum stats_type
get_stats_type(struct fd_acc_query * aq)383 get_stats_type(struct fd_acc_query *aq)
384 {
385 if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
386 return STATS_PRIMITIVE;
387
388 switch (aq->base.index) {
389 case PIPE_STAT_QUERY_PS_INVOCATIONS: return STATS_FRAGMENT;
390 case PIPE_STAT_QUERY_CS_INVOCATIONS: return STATS_COMPUTE;
391 default:
392 return STATS_PRIMITIVE;
393 }
394 }
395
396 static unsigned
stats_counter_index(struct fd_acc_query * aq)397 stats_counter_index(struct fd_acc_query *aq)
398 {
399 if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
400 return 7;
401
402 switch (aq->base.index) {
403 case PIPE_STAT_QUERY_IA_VERTICES: return 0;
404 case PIPE_STAT_QUERY_IA_PRIMITIVES: return 1;
405 case PIPE_STAT_QUERY_VS_INVOCATIONS: return 0;
406 case PIPE_STAT_QUERY_GS_INVOCATIONS: return 5;
407 case PIPE_STAT_QUERY_GS_PRIMITIVES: return 6;
408 case PIPE_STAT_QUERY_C_INVOCATIONS: return 7;
409 case PIPE_STAT_QUERY_C_PRIMITIVES: return 8;
410 case PIPE_STAT_QUERY_PS_INVOCATIONS: return 9;
411 case PIPE_STAT_QUERY_HS_INVOCATIONS: return 2;
412 case PIPE_STAT_QUERY_DS_INVOCATIONS: return 4;
413 case PIPE_STAT_QUERY_CS_INVOCATIONS: return 10;
414 default:
415 return 0;
416 }
417 }
418
/* Debug-only dump of a pipeline-stats sample; compiled out unless
 * DEBUG_COUNTERS is defined.  `idx` is the RBBM_PRIMCTR index and also
 * indexes the labels[] table below.
 */
static void
log_pipeline_stats(struct fd6_pipeline_stats_sample *ps, unsigned idx)
{
#ifdef DEBUG_COUNTERS
   const char *labels[] = {
      "VS_INVOCATIONS",
      "IA_PRIMITIVES",
      "HS_INVOCATIONS",
      "??",               /* RBBM_PRIMCTR_3 has no gallium mapping */
      "DS_INVOCATIONS",
      "GS_INVOCATIONS",
      "GS_PRIMITIVES",
      "C_INVOCATIONS",
      "C_PRIMITIVES",
      "PS_INVOCATIONS",
      "CS_INVOCATIONS",
   };

   mesa_logd("  counter\t\tstart\t\t\tstop\t\t\tdiff");
   mesa_logd("  RBBM_PRIMCTR_%d\t0x%016" PRIx64 "\t0x%016" PRIx64 "\t%" PRIi64 "\t%s",
             idx, ps->start, ps->stop, ps->stop - ps->start, labels[idx]);
#endif
}
442
/* Snapshot the start value of the query's counter and, if this is the
 * first active query of its type in the batch, start the hw counter set.
 */
static void
pipeline_stats_resume(struct fd_acc_query *aq, struct fd_batch *batch)
   assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;
   enum stats_type type = get_stats_type(aq);
   unsigned idx = stats_counter_index(aq);
   unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);

   OUT_WFI5(ring);

   /* snapshot 64b counter value (LO+HI pair) to the `start` field: */
   OUT_PKT7(ring, CP_REG_TO_MEM, 3);
   OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_REG(reg));
   stats_reloc(ring, aq, start);

   assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));

   /* refcount per counter set: only the first query starts the counters */
   if (!batch->pipeline_stats_queries_active[type])
      fd6_event_write(batch, ring, stats_counter_events[type].start, false);
   batch->pipeline_stats_queries_active[type]++;
}
466
467 static void
pipeline_stats_pause(struct fd_acc_query * aq,struct fd_batch * batch)468 pipeline_stats_pause(struct fd_acc_query *aq, struct fd_batch *batch)
469 assert_dt
470 {
471 struct fd_ringbuffer *ring = batch->draw;
472 enum stats_type type = get_stats_type(aq);
473 unsigned idx = stats_counter_index(aq);
474 unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);
475
476 OUT_WFI5(ring);
477
478 /* snapshot the end values: */
479 OUT_PKT7(ring, CP_REG_TO_MEM, 3);
480 OUT_RING(ring, CP_REG_TO_MEM_0_64B |
481 CP_REG_TO_MEM_0_CNT(2) |
482 CP_REG_TO_MEM_0_REG(reg));
483 stats_reloc(ring, aq, stop);
484
485 assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));
486 assert(batch->pipeline_stats_queries_active[type] > 0);
487
488 batch->pipeline_stats_queries_active[type]--;
489 if (batch->pipeline_stats_queries_active[type])
490 fd6_event_write(batch, ring, stats_counter_events[type].stop, false);
491
492 /* result += stop - start: */
493 OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
494 OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x40000000);
495 stats_reloc(ring, aq, result);
496 stats_reloc(ring, aq, result);
497 stats_reloc(ring, aq, stop)
498 stats_reloc(ring, aq, start);
499 }
500
501 static void
pipeline_stats_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)502 pipeline_stats_result(struct fd_acc_query *aq,
503 struct fd_acc_query_sample *s,
504 union pipe_query_result *result)
505 {
506 struct fd6_pipeline_stats_sample *ps = fd6_pipeline_stats_sample(s);
507
508 log_pipeline_stats(ps, stats_counter_index(aq));
509
510 result->u64 = ps->result;
511 }
512
/* GPU-side copy of the accumulated pipeline-stat result into a buffer. */
static void
pipeline_stats_result_resource(struct fd_acc_query *aq,
                               struct fd_ringbuffer *ring,
                               enum pipe_query_value_type result_type,
                               int index, struct fd_resource *dst,
                               unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_pipeline_stats_sample, result));
}
523
/* PRIMITIVES_GENERATED and single-counter pipeline statistics share the
 * same resume/pause/result plumbing; they differ only in how the counter
 * index is selected (see get_stats_type()/stats_counter_index()).
 */
static const struct fd_acc_sample_provider primitives_generated = {
   .query_type = PIPE_QUERY_PRIMITIVES_GENERATED,
   .size = sizeof(struct fd6_pipeline_stats_sample),
   .resume = pipeline_stats_resume,
   .pause = pipeline_stats_pause,
   .result = pipeline_stats_result,
   .result_resource = pipeline_stats_result_resource,
};

static const struct fd_acc_sample_provider pipeline_statistics_single = {
   .query_type = PIPE_QUERY_PIPELINE_STATISTICS_SINGLE,
   .size = sizeof(struct fd6_pipeline_stats_sample),
   .resume = pipeline_stats_resume,
   .pause = pipeline_stats_pause,
   .result = pipeline_stats_result,
   .result_resource = pipeline_stats_result_resource,
};
541
/* Sample layout for stream-out primitive queries: per-stream snapshots of
 * VPC_SO_STREAM_COUNTS plus the accumulated deltas.
 */
struct PACKED fd6_primitives_sample {
   struct fd_acc_query_sample base;

   /* VPC_SO_STREAM_COUNTS dest address must be 32b aligned: */
   uint64_t pad[3];

   struct {
      uint64_t emitted, generated;
   } start[4], stop[4], result;   /* one start/stop pair per SO stream */
};
DEFINE_CAST(fd_acc_query_sample, fd6_primitives_sample);
553
/* Emit a reloc to a field of the query's fd6_primitives_sample (trailing
 * ';' is part of the expansion, as with stats_reloc()):
 */
#define primitives_reloc(ring, aq, field) \
   OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \
             __offsetof(struct fd6_primitives_sample, field), 0, 0);
557
/* Debug-only dump of a stream-out primitives sample; compiled out unless
 * DEBUG_COUNTERS is defined.
 *
 * Fix: the "emitted" line previously printed the .generated fields and
 * vice versa; the field accesses now match their labels.
 */
static void
log_primitives_sample(struct fd6_primitives_sample *ps)
{
#ifdef DEBUG_COUNTERS
   mesa_logd("  so counts");
   for (int i = 0; i < ARRAY_SIZE(ps->start); i++) {
      mesa_logd("  CHANNEL %d emitted\t0x%016" PRIx64 "\t0x%016" PRIx64
                "\t%" PRIi64,
                i, ps->start[i].emitted, ps->stop[i].emitted,
                ps->stop[i].emitted - ps->start[i].emitted);
      mesa_logd("  CHANNEL %d generated\t0x%016" PRIx64 "\t0x%016" PRIx64
                "\t%" PRIi64,
                i, ps->start[i].generated, ps->stop[i].generated,
                ps->stop[i].generated - ps->start[i].generated);
   }

   mesa_logd("generated %" PRIu64 ", emitted %" PRIu64, ps->result.generated,
             ps->result.emitted);
#endif
}
578
/* Snapshot all four SO stream counters into start[] via the
 * WRITE_PRIMITIVE_COUNTS event.
 */
static void
primitives_emitted_resume(struct fd_acc_query *aq,
                          struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_WFI5(ring);

   /* hw requires a 32-byte aligned destination: */
   ASSERT_ALIGNED(struct fd6_primitives_sample, start[0], 32);

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2);
   primitives_reloc(ring, aq, start[0]);

   fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false);
}
594
/* Accumulate the emitted-primitives delta for one SO stream on the GPU.
 * (Name typo "accumultate" predates this review; kept to match callers.)
 */
static void
accumultate_primitives_emitted(struct fd_acc_query *aq,
                               struct fd_ringbuffer *ring,
                               int idx)
{
   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
   primitives_reloc(ring, aq, result.emitted);     /* dst */
   primitives_reloc(ring, aq, result.emitted);     /* srcA */
   primitives_reloc(ring, aq, stop[idx].emitted);  /* srcB */
   primitives_reloc(ring, aq, start[idx].emitted); /* srcC */
}
608
/* Accumulate the generated-primitives delta for one SO stream on the GPU. */
static void
accumultate_primitives_generated(struct fd_acc_query *aq,
                                 struct fd_ringbuffer *ring,
                                 int idx)
{
   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
   primitives_reloc(ring, aq, result.generated);     /* dst */
   primitives_reloc(ring, aq, result.generated);     /* srcA */
   primitives_reloc(ring, aq, stop[idx].generated);  /* srcB */
   primitives_reloc(ring, aq, start[idx].generated); /* srcC */
}
622
/* Snapshot the end SO stream counts and accumulate the deltas needed by
 * the particular query type.
 */
static void
primitives_emitted_pause(struct fd_acc_query *aq,
                         struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_WFI5(ring);

   ASSERT_ALIGNED(struct fd6_primitives_sample, stop[0], 32);

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2);
   primitives_reloc(ring, aq, stop[0]);

   fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false);

   /* flush the counter writes before CP_MEM_TO_MEM reads them back: */
   fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true);

   if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      /* Need results from all channels: */
      for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
         accumultate_primitives_emitted(aq, ring, i);
         accumultate_primitives_generated(aq, ring, i);
      }
   } else {
      accumultate_primitives_emitted(aq, ring, aq->base.index);
      /* Only need primitives generated counts for the overflow queries: */
      if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
         accumultate_primitives_generated(aq, ring, aq->base.index);
   }
}
653
654 static void
primitives_emitted_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)655 primitives_emitted_result(struct fd_acc_query *aq,
656 struct fd_acc_query_sample *s,
657 union pipe_query_result *result)
658 {
659 struct fd6_primitives_sample *ps = fd6_primitives_sample(s);
660
661 log_primitives_sample(ps);
662
663 result->u64 = ps->result.emitted;
664 }
665
/* GPU-side copy of the emitted-primitives result into a buffer. */
static void
primitives_emitted_result_resource(struct fd_acc_query *aq,
                                   struct fd_ringbuffer *ring,
                                   enum pipe_query_value_type result_type,
                                   int index, struct fd_resource *dst,
                                   unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_primitives_sample, result.emitted));
}
676
677 static void
so_overflow_predicate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)678 so_overflow_predicate_result(struct fd_acc_query *aq,
679 struct fd_acc_query_sample *s,
680 union pipe_query_result *result)
681 {
682 struct fd6_primitives_sample *ps = fd6_primitives_sample(s);
683
684 log_primitives_sample(ps);
685
686 result->b = ps->result.emitted != ps->result.generated;
687 }
688
/* GPU-side resolve of the SO overflow predicate into a caller buffer:
 * compute (generated - emitted) then collapse to 0/1.
 */
static void
so_overflow_predicate_result_resource(struct fd_acc_query *aq,
                                      struct fd_ringbuffer *ring,
                                      enum pipe_query_value_type result_type,
                                      int index, struct fd_resource *dst,
                                      unsigned offset)
{
   fd_ringbuffer_attach_bo(ring, dst->bo);
   fd_ringbuffer_attach_bo(ring, fd_resource(aq->prsc)->bo);

   /* result = generated - emitted: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 7);
   OUT_RING(ring, CP_MEM_TO_MEM_0_NEG_B |
            COND(result_type >= PIPE_QUERY_TYPE_I64, CP_MEM_TO_MEM_0_DOUBLE));
   OUT_RELOC(ring, dst->bo, offset, 0, 0);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, result.emitted);

   /* This is a bit awkward, but glcts expects the result to be 1 or 0
    * rather than non-zero vs zero:
    */
   OUT_PKT7(ring, CP_COND_WRITE5, 9);
   OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) |
                  CP_COND_WRITE5_0_POLL(POLL_MEMORY) |
                  CP_COND_WRITE5_0_WRITE_MEMORY);
   OUT_RELOC(ring, dst->bo, offset, 0, 0); /* POLL_ADDR_LO/HI */
   OUT_RING(ring, CP_COND_WRITE5_3_REF(0));
   OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
   OUT_RELOC(ring, dst->bo, offset, 0, 0); /* WRITE_ADDR_LO/HI */
   OUT_RING(ring, 1);
   OUT_RING(ring, 0);
}
721
/* All three stream-out query flavors share the resume/pause plumbing and
 * differ only in result interpretation (count vs. overflow predicate).
 */
static const struct fd_acc_sample_provider primitives_emitted = {
   .query_type = PIPE_QUERY_PRIMITIVES_EMITTED,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume,
   .pause = primitives_emitted_pause,
   .result = primitives_emitted_result,
   .result_resource = primitives_emitted_result_resource,
};

static const struct fd_acc_sample_provider so_overflow_any_predicate = {
   .query_type = PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume,
   .pause = primitives_emitted_pause,
   .result = so_overflow_predicate_result,
   .result_resource = so_overflow_predicate_result_resource,
};

static const struct fd_acc_sample_provider so_overflow_predicate = {
   .query_type = PIPE_QUERY_SO_OVERFLOW_PREDICATE,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume,
   .pause = primitives_emitted_pause,
   .result = so_overflow_predicate_result,
   .result_resource = so_overflow_predicate_result_resource,
};
748
749 /*
750 * Performance Counter (batch) queries:
751 *
752 * Only one of these is active at a time, per design of the gallium
 * batch_query API design. One perfcntr query tracks N query_types,
754 * each of which has a 'fd_batch_query_entry' that maps it back to
755 * the associated group and counter.
756 */
757
/* Maps one tracked query_type back to its perfcntr group/countable: */
struct fd_batch_query_entry {
   uint8_t gid; /* group-id */
   uint8_t cid; /* countable-id within the group */
};

/* Per-query payload for batch (perfcntr) queries; query_entries[] has
 * num_query_entries elements (flexible array member).
 */
struct fd_batch_query_data {
   struct fd_screen *screen;
   unsigned num_query_entries;
   struct fd_batch_query_entry query_entries[];
};
768
/* Program the selected countables into hw counter select registers and
 * snapshot the starting counter values.
 */
static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   /* Tracks how many counters have been claimed per group so far, so each
    * entry gets the group's next free physical counter (VLA, per-screen
    * group count):
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   OUT_WFI5(ring);

   /* configure performance counters for the requested queries: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;

      assert(counter_idx < g->num_counters);

      OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
      OUT_RING(ring, g->countables[entry->cid].selector);
   }

   /* restart the per-group allocation so the snapshot loop below walks the
    * same counter assignment:
    */
   memset(counters_per_group, 0, sizeof(counters_per_group));

   /* and snapshot the start values */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, start));
   }
}
808
/* Snapshot the end counter values and accumulate per-entry deltas.  Uses
 * the same per-group counter assignment order as perfcntr_resume().
 */
static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   OUT_WFI5(ring);

   /* TODO do we need to bother to turn anything off? */

   /* snapshot the end values: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));
   }

   /* and compute the result: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      /* result += stop - start: */
      OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
      OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));   /* srcB */
      OUT_RELOC(ring, query_sample_idx(aq, i, start));  /* srcC */
   }
}
847
848 static void
perfcntr_accumulate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)849 perfcntr_accumulate_result(struct fd_acc_query *aq,
850 struct fd_acc_query_sample *s,
851 union pipe_query_result *result)
852 {
853 struct fd_batch_query_data *data =
854 (struct fd_batch_query_data *)aq->query_data;
855 struct fd6_query_sample *sp = fd6_query_sample(s);
856
857 for (unsigned i = 0; i < data->num_query_entries; i++) {
858 result->batch[i].u64 = sp[i].result;
859 }
860 }
861
862 static const struct fd_acc_sample_provider perfcntr = {
863 .query_type = FD_QUERY_FIRST_PERFCNTR,
864 .always = true,
865 .resume = perfcntr_resume,
866 .pause = perfcntr_pause,
867 .result = perfcntr_accumulate_result,
868 };
869
870 static struct pipe_query *
fd6_create_batch_query(struct pipe_context * pctx,unsigned num_queries,unsigned * query_types)871 fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
872 unsigned *query_types)
873 {
874 struct fd_context *ctx = fd_context(pctx);
875 struct fd_screen *screen = ctx->screen;
876 struct fd_query *q;
877 struct fd_acc_query *aq;
878 struct fd_batch_query_data *data;
879
880 data = CALLOC_VARIANT_LENGTH_STRUCT(
881 fd_batch_query_data, num_queries * sizeof(data->query_entries[0]));
882
883 data->screen = screen;
884 data->num_query_entries = num_queries;
885
886 /* validate the requested query_types and ensure we don't try
887 * to request more query_types of a given group than we have
888 * counters:
889 */
890 unsigned counters_per_group[screen->num_perfcntr_groups];
891 memset(counters_per_group, 0, sizeof(counters_per_group));
892
893 for (unsigned i = 0; i < num_queries; i++) {
894 unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;
895
896 /* verify valid query_type, ie. is it actually a perfcntr? */
897 if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
898 (idx >= screen->num_perfcntr_queries)) {
899 mesa_loge("invalid batch query query_type: %u", query_types[i]);
900 goto error;
901 }
902
903 struct fd_batch_query_entry *entry = &data->query_entries[i];
904 struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];
905
906 entry->gid = pq->group_id;
907
908 /* the perfcntr_queries[] table flattens all the countables
909 * for each group in series, ie:
910 *
911 * (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
912 *
913 * So to find the countable index just step back through the
914 * table to find the first entry with the same group-id.
915 */
916 while (pq > screen->perfcntr_queries) {
917 pq--;
918 if (pq->group_id == entry->gid)
919 entry->cid++;
920 }
921
922 if (counters_per_group[entry->gid] >=
923 screen->perfcntr_groups[entry->gid].num_counters) {
924 mesa_loge("too many counters for group %u", entry->gid);
925 goto error;
926 }
927
928 counters_per_group[entry->gid]++;
929 }
930
931 q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
932 aq = fd_acc_query(q);
933
934 /* sample buffer size is based on # of queries: */
935 aq->size = num_queries * sizeof(struct fd6_query_sample);
936 aq->query_data = data;
937
938 return (struct pipe_query *)q;
939
940 error:
941 free(data);
942 return NULL;
943 }
944
945 void
fd6_query_context_init(struct pipe_context * pctx)946 fd6_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
947 {
948 struct fd_context *ctx = fd_context(pctx);
949
950 ctx->create_query = fd_acc_create_query;
951 ctx->query_update_batch = fd_acc_query_update_batch;
952
953 ctx->record_timestamp = record_timestamp;
954 ctx->ts_to_ns = ticks_to_ns;
955
956 pctx->create_batch_query = fd6_create_batch_query;
957
958 fd_acc_query_register_provider(pctx, &occlusion_counter);
959 fd_acc_query_register_provider(pctx, &occlusion_predicate);
960 fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);
961
962 fd_acc_query_register_provider(pctx, &time_elapsed);
963 fd_acc_query_register_provider(pctx, ×tamp);
964
965 fd_acc_query_register_provider(pctx, &primitives_generated);
966 fd_acc_query_register_provider(pctx, &pipeline_statistics_single);
967
968 fd_acc_query_register_provider(pctx, &primitives_emitted);
969 fd_acc_query_register_provider(pctx, &so_overflow_any_predicate);
970 fd_acc_query_register_provider(pctx, &so_overflow_predicate);
971 }
972