/*
 * Copyright © 2017 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#define FD_BO_NO_HARDPIN 1

/* NOTE: see https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A5xx-Queries */

#include "freedreno_query_acc.h"
#include "freedreno_resource.h"

#include "fd6_context.h"
#include "fd6_emit.h"
#include "fd6_query.h"

#include "fd6_pack.h"

/* g++ is picky about offsets that cannot be resolved at compile time, so
 * roll our own __offsetof()
 */
#define __offsetof(type, field)                                                \
   ({ type _x = {}; ((uint8_t *)&_x.field) - ((uint8_t *)&_x); })
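
/* A note on usage (illustrative): unlike plain offsetof(), the
 * statement-expression form above also accepts fields with runtime array
 * subscripts, e.g.
 *
 *    __offsetof(struct fd6_primitives_sample, stop[idx])
 *
 * which is what primitives_reloc() further below relies on.
 */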

struct PACKED fd6_query_sample {
   struct fd_acc_query_sample base;

   /* The RB_SAMPLE_COUNT_ADDR destination needs to be 16-byte aligned: */
   uint64_t pad;

   uint64_t start;
   uint64_t result;
   uint64_t stop;
};
FD_DEFINE_CAST(fd_acc_query_sample, fd6_query_sample);

/* offset of a single field of an array of fd6_query_sample: */
#define query_sample_idx(aq, idx, field)                                       \
   fd_resource((aq)->prsc)->bo,                                                \
      (idx * sizeof(struct fd6_query_sample)) +                                \
         offsetof(struct fd6_query_sample, field),                             \
      0, 0

/* offset of a single field of fd6_query_sample: */
#define query_sample(aq, field) query_sample_idx(aq, 0, field)
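
/* A minimal sketch of how the macro expands (assuming freedreno's
 * OUT_RELOC(ring, bo, offset, orval, shift) argument order):
 *
 *    OUT_RELOC(ring, query_sample(aq, stop));
 *      => OUT_RELOC(ring, fd_resource(aq->prsc)->bo,
 *                   offsetof(struct fd6_query_sample, stop), 0, 0);
 */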

/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */

template <chip CHIP>
static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->draw;

   ASSERT_ALIGNED(struct fd6_query_sample, start, 16);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
      OUT_RELOC(ring, query_sample(aq, start));

      fd6_event_write<CHIP>(ctx, ring, FD_ZPASS_DONE);

      /* Copied from blob's cmdstream, not sure why it is done. */
      if (CHIP == A7XX) {
         fd6_event_write<CHIP>(ctx, ring, FD_CCU_CLEAN_DEPTH);
      }
   } else {
      OUT_PKT(ring, CP_EVENT_WRITE7,
              CP_EVENT_WRITE7_0(
                 .event = ZPASS_DONE,
                 .write_sample_count = true,
              ),
              EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
      );
      OUT_PKT(ring, CP_EVENT_WRITE7,
              CP_EVENT_WRITE7_0(
                 .event = ZPASS_DONE,
                 .write_sample_count = true,
                 .sample_count_end_offset = true,
                 .write_accum_sample_count_diff = true,
              ),
              EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
      );
   }

   ctx->occlusion_queries_active++;

   /* Just directly bash the gen specific LRZ dirty bit, since we don't
    * need to re-emit any other LRZ related state:
    */
   ctx->gen_dirty |= FD6_GROUP_LRZ;
}

template <chip CHIP>
static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;
   struct fd_ringbuffer *ring = batch->draw;

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT7(ring, CP_MEM_WRITE, 4);
      OUT_RELOC(ring, query_sample(aq, stop));
      OUT_RING(ring, 0xffffffff);
      OUT_RING(ring, 0xffffffff);

      OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
   }

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   ASSERT_ALIGNED(struct fd6_query_sample, stop, 16);

   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
      OUT_RELOC(ring, query_sample(aq, stop));

      fd6_event_write<CHIP>(batch->ctx, ring, FD_ZPASS_DONE);

      /* To avoid stalling in the draw buffer, emit the code to compute the
       * counter delta in the epilogue ring.
       */
      struct fd_ringbuffer *epilogue = fd_batch_get_tile_epilogue(batch);

      OUT_PKT7(epilogue, CP_WAIT_REG_MEM, 6);
      OUT_RING(epilogue, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                         CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
      OUT_RELOC(epilogue, query_sample(aq, stop));
      OUT_RING(epilogue, CP_WAIT_REG_MEM_3_REF(0xffffffff));
      OUT_RING(epilogue, CP_WAIT_REG_MEM_4_MASK(0xffffffff));
      OUT_RING(epilogue, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

      /* result += stop - start: */
      OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9);
      OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */
      OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */
      OUT_RELOC(epilogue, query_sample(aq, stop));   /* srcB */
      OUT_RELOC(epilogue, query_sample(aq, start));  /* srcC */
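
      /* Sketch of the CP_MEM_TO_MEM semantics used here: with DOUBLE|NEG_C
       * set, the CP computes a 64b dst = srcA + srcB - srcC, i.e.
       * result = result + (stop - start).  The same accumulate pattern
       * recurs throughout this file.
       */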
   } else {
      OUT_PKT(ring, CP_EVENT_WRITE7,
              CP_EVENT_WRITE7_0(
                 .event = ZPASS_DONE,
                 .write_sample_count = true,
              ),
              EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, stop)),
      );
      OUT_PKT(ring, CP_EVENT_WRITE7,
              CP_EVENT_WRITE7_0(
                 .event = ZPASS_DONE,
                 .write_sample_count = true,
                 .sample_count_end_offset = true,
                 .write_accum_sample_count_diff = true,
              ),
              /* Note: SQE is adding offsets to the iova: SAMPLE_COUNT_END_OFFSET
               * causes the result to be written to iova+16, and
               * WRITE_ACCUM_SAMP_COUNT_DIFF does *(iova + 8) += *(iova + 16) - *iova
               *
               * It just so happens this is the layout we already use for
               * start/result/stop, so we just give the start address in all
               * cases.
               */
              EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)),
      );
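
      /* An illustrative view of the second ZPASS_DONE above, with
       * iova = &sample->start (start at +0, result at +8, stop at +16):
       *
       *    stop    = current_sample_count;   // written to iova + 16
       *    result += stop - start;           // *(iova+8) += *(iova+16) - *iova
       */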
   }

   assert(ctx->occlusion_queries_active > 0);
   ctx->occlusion_queries_active--;

   /* Just directly bash the gen specific LRZ dirty bit, since we don't
    * need to re-emit any other LRZ related state:
    */
   ctx->gen_dirty |= FD6_GROUP_LRZ;
}

static void
occlusion_counter_result(struct fd_acc_query *aq,
                         struct fd_acc_query_sample *s,
                         union pipe_query_result *result)
{
   struct fd6_query_sample *sp = fd6_query_sample(s);
   result->u64 = sp->result;
}

static void
occlusion_counter_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                                  enum pipe_query_value_type result_type,
                                  int index, struct fd_resource *dst,
                                  unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}

static void
occlusion_predicate_result(struct fd_acc_query *aq,
                           struct fd_acc_query_sample *s,
                           union pipe_query_result *result)
{
   struct fd6_query_sample *sp = fd6_query_sample(s);
   result->b = !!sp->result;
}

static void
occlusion_predicate_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                                    enum pipe_query_value_type result_type,
                                    int index, struct fd_resource *dst,
                                    unsigned offset)
{
   /* This is a bit annoying, but we need to turn the result into a one or
    * zero.. to do this use a CP_COND_WRITE to overwrite the result with
    * a one if it is non-zero.  This doesn't change the results if the
    * query is also read on the CPU (ie. occlusion_predicate_result()).
    */
   OUT_PKT7(ring, CP_COND_WRITE5, 9);
   OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) |
                  CP_COND_WRITE5_0_POLL(POLL_MEMORY) |
                  CP_COND_WRITE5_0_WRITE_MEMORY);
   OUT_RELOC(ring, query_sample(aq, result)); /* POLL_ADDR_LO/HI */
   OUT_RING(ring, CP_COND_WRITE5_3_REF(0));
   OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
   OUT_RELOC(ring, query_sample(aq, result)); /* WRITE_ADDR_LO/HI */
   OUT_RING(ring, 1);
   OUT_RING(ring, 0);
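
   /* Net effect, as a sketch (the exact compare width is a CP detail): the
    * CP polls the value at POLL_ADDR and, since FUNCTION is WRITE_NE with
    * REF 0, writes the two trailing dwords (lo=1, hi=0) to WRITE_ADDR only
    * when (value & MASK) != 0, clamping any non-zero result to exactly 1.
    */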

   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}

template <chip CHIP>
static const struct fd_acc_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume<CHIP>,
   .pause = occlusion_pause<CHIP>,
   .result = occlusion_counter_result,
   .result_resource = occlusion_counter_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume<CHIP>,
   .pause = occlusion_pause<CHIP>,
   .result = occlusion_predicate_result,
   .result_resource = occlusion_predicate_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume<CHIP>,
   .pause = occlusion_pause<CHIP>,
   .result = occlusion_predicate_result,
   .result_resource = occlusion_predicate_result_resource,
};

/*
 * Timestamp Queries:
 */

template <chip CHIP>
static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   fd6_record_ts<CHIP>(ring, query_sample(aq, start));
}

template <chip CHIP>
static void
time_elapsed_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   fd6_record_ts<CHIP>(ring, query_sample(aq, stop));

   OUT_WFI5(ring);

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */
}

static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
   /* We captured a timestamp in timestamp_resume(), nothing to do here. */
}

/* timestamp logging for u_trace: */
template <chip CHIP>
static void
record_timestamp(struct fd_ringbuffer *ring, struct fd_bo *bo, unsigned offset)
{
   fd_ringbuffer_attach_bo(ring, bo);
   fd6_record_ts<CHIP>(ring, bo, offset, 0, 0);
}

static void
time_elapsed_accumulate_result(struct fd_acc_query *aq,
                               struct fd_acc_query_sample *s,
                               union pipe_query_result *result)
{
   struct fd6_query_sample *sp = fd6_query_sample(s);
   result->u64 = ticks_to_ns(sp->result);
}

static void
time_elapsed_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                             enum pipe_query_value_type result_type,
                             int index, struct fd_resource *dst,
                             unsigned offset)
{
   // TODO ticks_to_ns conversion would require spinning up a compute shader?
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}

static void
timestamp_accumulate_result(struct fd_acc_query *aq,
                            struct fd_acc_query_sample *s,
                            union pipe_query_result *result)
{
   struct fd6_query_sample *sp = fd6_query_sample(s);
   result->u64 = ticks_to_ns(sp->start);
}

static void
timestamp_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                          enum pipe_query_value_type result_type,
                          int index, struct fd_resource *dst,
                          unsigned offset)
{
   // TODO ticks_to_ns conversion would require spinning up a compute shader?
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, start));
}

template <chip CHIP>
static const struct fd_acc_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .size = sizeof(struct fd6_query_sample),
   .resume = timestamp_resume<CHIP>,
   .pause = time_elapsed_pause<CHIP>,
   .result = time_elapsed_accumulate_result,
   .result_resource = time_elapsed_result_resource,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more nonsensical.  So
 * we just return the timestamp on the last tile and hope that is
 * kind of good enough.
 */

template <chip CHIP>
static const struct fd_acc_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .size = sizeof(struct fd6_query_sample),
   .resume = timestamp_resume<CHIP>,
   .pause = timestamp_pause,
   .result = timestamp_accumulate_result,
   .result_resource = timestamp_result_resource,
};

struct PACKED fd6_pipeline_stats_sample {
   struct fd_acc_query_sample base;

   uint64_t start, stop, result;
};
FD_DEFINE_CAST(fd_acc_query_sample, fd6_pipeline_stats_sample);

#define stats_reloc(ring, aq, field)                                           \
   OUT_RELOC(ring, fd_resource((aq)->prsc)->bo,                                \
             offsetof(struct fd6_pipeline_stats_sample, field), 0, 0);

/* Mapping of counters to pipeline stats:
 *
 *   Gallium (PIPE_STAT_QUERY_x) | Vulkan (VK_QUERY_PIPELINE_STATISTIC_x_BIT) | hw counter
 *   ----------------------------+--------------------------------------------+----------------
 *   IA_VERTICES                 | INPUT_ASSEMBLY_VERTICES                    | RBBM_PRIMCTR_0
 *   IA_PRIMITIVES               | INPUT_ASSEMBLY_PRIMITIVES                  | RBBM_PRIMCTR_1
 *   VS_INVOCATIONS              | VERTEX_SHADER_INVOCATIONS                  | RBBM_PRIMCTR_2
 *   GS_INVOCATIONS              | GEOMETRY_SHADER_INVOCATIONS                | RBBM_PRIMCTR_5
 *   GS_PRIMITIVES               | GEOMETRY_SHADER_PRIMITIVES                 | RBBM_PRIMCTR_6
 *   C_INVOCATIONS               | CLIPPING_INVOCATIONS                       | RBBM_PRIMCTR_7
 *   C_PRIMITIVES                | CLIPPING_PRIMITIVES                        | RBBM_PRIMCTR_8
 *   PS_INVOCATIONS              | FRAGMENT_SHADER_INVOCATIONS                | RBBM_PRIMCTR_9
 *   HS_INVOCATIONS              | TESSELLATION_CONTROL_SHADER_PATCHES        | RBBM_PRIMCTR_3
 *   DS_INVOCATIONS              | TESSELLATION_EVALUATION_SHADER_INVOCATIONS | RBBM_PRIMCTR_4
 *   CS_INVOCATIONS              | COMPUTE_SHADER_INVOCATIONS                 | RBBM_PRIMCTR_10
 */

enum stats_type {
   STATS_PRIMITIVE,
   STATS_FRAGMENT,
   STATS_COMPUTE,
};

static const struct {
   enum fd_gpu_event start, stop;
} stats_counter_events[] = {
   [STATS_PRIMITIVE] = { FD_START_PRIMITIVE_CTRS, FD_STOP_PRIMITIVE_CTRS },
   [STATS_FRAGMENT]  = { FD_START_FRAGMENT_CTRS,  FD_STOP_FRAGMENT_CTRS },
   [STATS_COMPUTE]   = { FD_START_COMPUTE_CTRS,   FD_STOP_COMPUTE_CTRS },
};

static enum stats_type
get_stats_type(struct fd_acc_query *aq)
{
   if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
      return STATS_PRIMITIVE;

   switch (aq->base.index) {
   case PIPE_STAT_QUERY_PS_INVOCATIONS: return STATS_FRAGMENT;
   case PIPE_STAT_QUERY_CS_INVOCATIONS: return STATS_COMPUTE;
   default:
      return STATS_PRIMITIVE;
   }
}

static unsigned
stats_counter_index(struct fd_acc_query *aq)
{
   if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
      return 7;

   switch (aq->base.index) {
   case PIPE_STAT_QUERY_IA_VERTICES:    return 0;
   case PIPE_STAT_QUERY_IA_PRIMITIVES:  return 1;
   case PIPE_STAT_QUERY_VS_INVOCATIONS: return 2;
   case PIPE_STAT_QUERY_GS_INVOCATIONS: return 5;
   case PIPE_STAT_QUERY_GS_PRIMITIVES:  return 6;
   case PIPE_STAT_QUERY_C_INVOCATIONS:  return 7;
   case PIPE_STAT_QUERY_C_PRIMITIVES:   return 8;
   case PIPE_STAT_QUERY_PS_INVOCATIONS: return 9;
   case PIPE_STAT_QUERY_HS_INVOCATIONS: return 3;
   case PIPE_STAT_QUERY_DS_INVOCATIONS: return 4;
   case PIPE_STAT_QUERY_CS_INVOCATIONS: return 10;
   default:
      return 0;
   }
}
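
/* The index returned above selects a 64b RBBM_PRIMCTR_n register pair;
 * pipeline_stats_resume()/pipeline_stats_pause() below derive the register
 * address as REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx).
 */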

static void
log_pipeline_stats(struct fd6_pipeline_stats_sample *ps, unsigned idx)
{
#ifdef DEBUG_COUNTERS
   const char *labels[] = {
      "IA_VERTICES",
      "IA_PRIMITIVES",
      "VS_INVOCATIONS",
      "HS_INVOCATIONS",
      "DS_INVOCATIONS",
      "GS_INVOCATIONS",
      "GS_PRIMITIVES",
      "C_INVOCATIONS",
      "C_PRIMITIVES",
      "PS_INVOCATIONS",
      "CS_INVOCATIONS",
   };

   mesa_logd("  counter\t\tstart\t\t\tstop\t\t\tdiff");
   mesa_logd("  RBBM_PRIMCTR_%d\t0x%016" PRIx64 "\t0x%016" PRIx64 "\t%" PRIi64 "\t%s",
             idx, ps->start, ps->stop, ps->stop - ps->start, labels[idx]);
#endif
}

template <chip CHIP>
static void
pipeline_stats_resume(struct fd_acc_query *aq, struct fd_batch *batch)
   assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;
   enum stats_type type = get_stats_type(aq);
   unsigned idx = stats_counter_index(aq);
   unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);

   OUT_WFI5(ring);

   OUT_PKT7(ring, CP_REG_TO_MEM, 3);
   OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_REG(reg));
   stats_reloc(ring, aq, start);

   assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));

   if (!batch->pipeline_stats_queries_active[type])
      fd6_event_write<CHIP>(batch->ctx, ring, stats_counter_events[type].start);
   batch->pipeline_stats_queries_active[type]++;
}

template <chip CHIP>
static void
pipeline_stats_pause(struct fd_acc_query *aq, struct fd_batch *batch)
   assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;
   enum stats_type type = get_stats_type(aq);
   unsigned idx = stats_counter_index(aq);
   unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);

   OUT_WFI5(ring);

   /* snapshot the end values: */
   OUT_PKT7(ring, CP_REG_TO_MEM, 3);
   OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_REG(reg));
   stats_reloc(ring, aq, stop);

   assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));
   assert(batch->pipeline_stats_queries_active[type] > 0);

   batch->pipeline_stats_queries_active[type]--;
   if (!batch->pipeline_stats_queries_active[type])
      fd6_event_write<CHIP>(batch->ctx, ring, stats_counter_events[type].stop);

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x40000000);
   stats_reloc(ring, aq, result);
   stats_reloc(ring, aq, result);
   stats_reloc(ring, aq, stop);
   stats_reloc(ring, aq, start);
}

static void
pipeline_stats_result(struct fd_acc_query *aq,
                      struct fd_acc_query_sample *s,
                      union pipe_query_result *result)
{
   struct fd6_pipeline_stats_sample *ps = fd6_pipeline_stats_sample(s);

   log_pipeline_stats(ps, stats_counter_index(aq));

   result->u64 = ps->result;
}

static void
pipeline_stats_result_resource(struct fd_acc_query *aq,
                               struct fd_ringbuffer *ring,
                               enum pipe_query_value_type result_type,
                               int index, struct fd_resource *dst,
                               unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_pipeline_stats_sample, result));
}

template <chip CHIP>
static const struct fd_acc_sample_provider primitives_generated = {
   .query_type = PIPE_QUERY_PRIMITIVES_GENERATED,
   .size = sizeof(struct fd6_pipeline_stats_sample),
   .resume = pipeline_stats_resume<CHIP>,
   .pause = pipeline_stats_pause<CHIP>,
   .result = pipeline_stats_result,
   .result_resource = pipeline_stats_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider pipeline_statistics_single = {
   .query_type = PIPE_QUERY_PIPELINE_STATISTICS_SINGLE,
   .size = sizeof(struct fd6_pipeline_stats_sample),
   .resume = pipeline_stats_resume<CHIP>,
   .pause = pipeline_stats_pause<CHIP>,
   .result = pipeline_stats_result,
   .result_resource = pipeline_stats_result_resource,
};

struct PACKED fd6_primitives_sample {
   struct fd_acc_query_sample base;

   /* VPC_SO_STREAM_COUNTS dest address must be 32b aligned: */
   uint64_t pad[3];

   struct {
      uint64_t emitted, generated;
   } start[4], stop[4], result;
};
FD_DEFINE_CAST(fd_acc_query_sample, fd6_primitives_sample);

#define primitives_reloc(ring, aq, field)                                      \
   OUT_RELOC(ring, fd_resource((aq)->prsc)->bo,                                \
             __offsetof(struct fd6_primitives_sample, field), 0, 0);

static void
log_primitives_sample(struct fd6_primitives_sample *ps)
{
#ifdef DEBUG_COUNTERS
   mesa_logd("  so counts");
   for (int i = 0; i < ARRAY_SIZE(ps->start); i++) {
      mesa_logd("  CHANNEL %d emitted\t0x%016" PRIx64 "\t0x%016" PRIx64
                "\t%" PRIi64,
                i, ps->start[i].emitted, ps->stop[i].emitted,
                ps->stop[i].emitted - ps->start[i].emitted);
      mesa_logd("  CHANNEL %d generated\t0x%016" PRIx64 "\t0x%016" PRIx64
                "\t%" PRIi64,
                i, ps->start[i].generated, ps->stop[i].generated,
                ps->stop[i].generated - ps->start[i].generated);
   }

   mesa_logd("generated %" PRIu64 ", emitted %" PRIu64, ps->result.generated,
             ps->result.emitted);
#endif
}

template <chip CHIP>
static void
primitives_emitted_resume(struct fd_acc_query *aq,
                          struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_WFI5(ring);

   ASSERT_ALIGNED(struct fd6_primitives_sample, start[0], 32);

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2);
   primitives_reloc(ring, aq, start[0]);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_WRITE_PRIMITIVE_COUNTS);
}

static void
accumulate_primitives_emitted(struct fd_acc_query *aq,
                              struct fd_ringbuffer *ring,
                              int idx)
{
   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
   primitives_reloc(ring, aq, result.emitted);
   primitives_reloc(ring, aq, result.emitted);
   primitives_reloc(ring, aq, stop[idx].emitted);
   primitives_reloc(ring, aq, start[idx].emitted);
}

static void
accumulate_primitives_generated(struct fd_acc_query *aq,
                                struct fd_ringbuffer *ring,
                                int idx)
{
   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, stop[idx].generated);
   primitives_reloc(ring, aq, start[idx].generated);
}

template <chip CHIP>
static void
primitives_emitted_pause(struct fd_acc_query *aq,
                         struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_WFI5(ring);

   ASSERT_ALIGNED(struct fd6_primitives_sample, stop[0], 32);

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2);
   primitives_reloc(ring, aq, stop[0]);

   fd6_event_write<CHIP>(batch->ctx, ring, FD_WRITE_PRIMITIVE_COUNTS);
   fd6_event_write<CHIP>(batch->ctx, ring, FD_CACHE_CLEAN);

   if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      /* Need results from all channels: */
      for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
         accumulate_primitives_emitted(aq, ring, i);
         accumulate_primitives_generated(aq, ring, i);
      }
   } else {
      accumulate_primitives_emitted(aq, ring, aq->base.index);
      /* Only need primitives generated counts for the overflow queries: */
      if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
         accumulate_primitives_generated(aq, ring, aq->base.index);
   }
}

static void
primitives_emitted_result(struct fd_acc_query *aq,
                          struct fd_acc_query_sample *s,
                          union pipe_query_result *result)
{
   struct fd6_primitives_sample *ps = fd6_primitives_sample(s);

   log_primitives_sample(ps);

   result->u64 = ps->result.emitted;
}

static void
primitives_emitted_result_resource(struct fd_acc_query *aq,
                                   struct fd_ringbuffer *ring,
                                   enum pipe_query_value_type result_type,
                                   int index, struct fd_resource *dst,
                                   unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_primitives_sample, result.emitted));
}

static void
so_overflow_predicate_result(struct fd_acc_query *aq,
                             struct fd_acc_query_sample *s,
                             union pipe_query_result *result)
{
   struct fd6_primitives_sample *ps = fd6_primitives_sample(s);

   log_primitives_sample(ps);

   result->b = ps->result.emitted != ps->result.generated;
}

static void
so_overflow_predicate_result_resource(struct fd_acc_query *aq,
                                      struct fd_ringbuffer *ring,
                                      enum pipe_query_value_type result_type,
                                      int index, struct fd_resource *dst,
                                      unsigned offset)
{
   fd_ringbuffer_attach_bo(ring, dst->bo);
   fd_ringbuffer_attach_bo(ring, fd_resource(aq->prsc)->bo);

   /* result = generated - emitted: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 7);
   OUT_RING(ring, CP_MEM_TO_MEM_0_NEG_B |
                  COND(result_type >= PIPE_QUERY_TYPE_I64, CP_MEM_TO_MEM_0_DOUBLE));
   OUT_RELOC(ring, dst->bo, offset, 0, 0);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, result.emitted);

   /* This is a bit awkward, but glcts expects the result to be 1 or 0
    * rather than non-zero vs zero:
    */
   OUT_PKT7(ring, CP_COND_WRITE5, 9);
   OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) |
                  CP_COND_WRITE5_0_POLL(POLL_MEMORY) |
                  CP_COND_WRITE5_0_WRITE_MEMORY);
   OUT_RELOC(ring, dst->bo, offset, 0, 0); /* POLL_ADDR_LO/HI */
   OUT_RING(ring, CP_COND_WRITE5_3_REF(0));
   OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
   OUT_RELOC(ring, dst->bo, offset, 0, 0); /* WRITE_ADDR_LO/HI */
   OUT_RING(ring, 1);
   OUT_RING(ring, 0);
}

template <chip CHIP>
static const struct fd_acc_sample_provider primitives_emitted = {
   .query_type = PIPE_QUERY_PRIMITIVES_EMITTED,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume<CHIP>,
   .pause = primitives_emitted_pause<CHIP>,
   .result = primitives_emitted_result,
   .result_resource = primitives_emitted_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider so_overflow_any_predicate = {
   .query_type = PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume<CHIP>,
   .pause = primitives_emitted_pause<CHIP>,
   .result = so_overflow_predicate_result,
   .result_resource = so_overflow_predicate_result_resource,
};

template <chip CHIP>
static const struct fd_acc_sample_provider so_overflow_predicate = {
   .query_type = PIPE_QUERY_SO_OVERFLOW_PREDICATE,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume<CHIP>,
   .pause = primitives_emitted_pause<CHIP>,
   .result = so_overflow_predicate_result,
   .result_resource = so_overflow_predicate_result_resource,
};

/*
 * Performance Counter (batch) queries:
 *
 * Only one of these is active at a time, per the design of the gallium
 * batch_query API.  One perfcntr query tracks N query_types, each of
 * which has a 'fd_batch_query_entry' that maps it back to the
 * associated group and counter.
 */

struct fd_batch_query_entry {
   uint8_t gid; /* group-id */
   uint8_t cid; /* countable-id within the group */
};

struct fd_batch_query_data {
   struct fd_screen *screen;
   unsigned num_query_entries;
   struct fd_batch_query_entry query_entries[];
};

static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   OUT_WFI5(ring);

   /* configure performance counters for the requested queries: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;

      assert(counter_idx < g->num_counters);

      OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
      OUT_RING(ring, g->countables[entry->cid].selector);
   }

   memset(counters_per_group, 0, sizeof(counters_per_group));

   /* and snapshot the start values */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, start));
   }
}

static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   OUT_WFI5(ring);

   /* TODO do we need to bother to turn anything off? */

   /* snapshot the end values: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));
   }

   /* and compute the result: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      /* result += stop - start: */
      OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
      OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));   /* srcB */
      OUT_RELOC(ring, query_sample_idx(aq, i, start));  /* srcC */
   }
}

static void
perfcntr_accumulate_result(struct fd_acc_query *aq,
                           struct fd_acc_query_sample *s,
                           union pipe_query_result *result)
{
   struct fd_batch_query_data *data =
      (struct fd_batch_query_data *)aq->query_data;
   struct fd6_query_sample *sp = fd6_query_sample(s);

   for (unsigned i = 0; i < data->num_query_entries; i++) {
      result->batch[i].u64 = sp[i].result;
   }
}

static const struct fd_acc_sample_provider perfcntr = {
   .query_type = FD_QUERY_FIRST_PERFCNTR,
   .always = true,
   .resume = perfcntr_resume,
   .pause = perfcntr_pause,
   .result = perfcntr_accumulate_result,
};

static struct pipe_query *
fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
                       unsigned *query_types)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct fd_query *q;
   struct fd_acc_query *aq;
   struct fd_batch_query_data *data;

   data = CALLOC_VARIANT_LENGTH_STRUCT(
      fd_batch_query_data, num_queries * sizeof(data->query_entries[0]));

   data->screen = screen;
   data->num_query_entries = num_queries;

   /* validate the requested query_types and ensure we don't try
    * to request more query_types of a given group than we have
    * counters:
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   for (unsigned i = 0; i < num_queries; i++) {
      unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;

      /* verify valid query_type, ie. is it actually a perfcntr? */
      if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
          (idx >= screen->num_perfcntr_queries)) {
         mesa_loge("invalid batch query query_type: %u", query_types[i]);
         goto error;
      }

      struct fd_batch_query_entry *entry = &data->query_entries[i];
      struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];

      entry->gid = pq->group_id;

      /* the perfcntr_queries[] table flattens all the countables
       * for each group in series, ie:
       *
       *    (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
       *
       * So to find the countable index just step back through the
       * table to find the first entry with the same group-id.
       */
      while (pq > screen->perfcntr_queries) {
         pq--;
         if (pq->group_id == entry->gid)
            entry->cid++;
      }
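
      /* Illustrative walk of the loop above: if perfcntr_queries[] is
       *    (G0,C0), (G0,C1), (G1,C0), (G1,C1)
       * and pq initially points at index 3 (G1,C1), stepping back finds
       * one earlier entry with the same group-id, leaving cid == 1.
       */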

      if (counters_per_group[entry->gid] >=
          screen->perfcntr_groups[entry->gid].num_counters) {
         mesa_loge("too many counters for group %u", entry->gid);
         goto error;
      }

      counters_per_group[entry->gid]++;
   }

   q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
   aq = fd_acc_query(q);

   /* sample buffer size is based on # of queries: */
   aq->size = num_queries * sizeof(struct fd6_query_sample);
   aq->query_data = data;

   return (struct pipe_query *)q;

error:
   free(data);
   return NULL;
}

template <chip CHIP>
void
fd6_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->create_query = fd_acc_create_query;
   ctx->query_update_batch = fd_acc_query_update_batch;

   ctx->record_timestamp = record_timestamp<CHIP>;
   ctx->ts_to_ns = ticks_to_ns;

   pctx->create_batch_query = fd6_create_batch_query;

   fd_acc_query_register_provider(pctx, &occlusion_counter<CHIP>);
   fd_acc_query_register_provider(pctx, &occlusion_predicate<CHIP>);
   fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative<CHIP>);

   fd_acc_query_register_provider(pctx, &time_elapsed<CHIP>);
   fd_acc_query_register_provider(pctx, &timestamp<CHIP>);

   fd_acc_query_register_provider(pctx, &primitives_generated<CHIP>);
   fd_acc_query_register_provider(pctx, &pipeline_statistics_single<CHIP>);

   fd_acc_query_register_provider(pctx, &primitives_emitted<CHIP>);
   fd_acc_query_register_provider(pctx, &so_overflow_any_predicate<CHIP>);
   fd_acc_query_register_provider(pctx, &so_overflow_predicate<CHIP>);
}
FD_GENX(fd6_query_context_init);