• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
3  * Copyright © 2018 Google, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  * Authors:
25  *    Rob Clark <robclark@freedesktop.org>
26  */
27 
28 #define FD_BO_NO_HARDPIN 1
29 
30 /* NOTE: see https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A5xx-Queries */
31 
32 #include "freedreno_query_acc.h"
33 #include "freedreno_resource.h"
34 
35 #include "fd6_context.h"
36 #include "fd6_emit.h"
37 #include "fd6_query.h"
38 
/* g++ is a picky about offsetof that cannot be resolved at compile time, so
 * roll our own __offsetof()
 *
 * Uses a GCC/clang statement-expression: computes the field offset at runtime
 * from a zero-initialized temporary instead of relying on constant folding.
 * NOTE(review): double-underscore names are reserved for the implementation;
 * kept for consistency with existing callers.
 */
#define __offsetof(type, field)                                                \
   ({ type _x = {}; ((uint8_t *)&_x.field) - ((uint8_t *)&_x);})
44 
/* GPU-written sample buffer layout shared by the occlusion and timestamp
 * query providers.  'start' is snapshotted at query resume, 'stop' at pause,
 * and 'result' accumulates (stop - start) across suspend/resume cycles.
 */
struct PACKED fd6_query_sample {
   struct fd_acc_query_sample base;

   /* The RB_SAMPLE_COUNT_ADDR destination needs to be 16-byte aligned: */
   uint64_t pad;

   uint64_t start;   /* counter value at resume */
   uint64_t result;  /* accumulated delta across tiles/resumes */
   uint64_t stop;    /* counter value at pause */
};
DEFINE_CAST(fd_acc_query_sample, fd6_query_sample);
56 
/* offset of a single field of an array of fd6_query_sample.  Expands to the
 * bo/offset/or/shift argument tuple expected by OUT_RELOC().
 *
 * Note: 'idx' is parenthesized so that expression arguments (e.g. 'i + 1')
 * multiply correctly instead of binding as 'i + (1 * sizeof(...))'.
 */
#define query_sample_idx(aq, idx, field)                                       \
   fd_resource((aq)->prsc)->bo,                                                \
      ((idx) * sizeof(struct fd6_query_sample)) +                              \
         offsetof(struct fd6_query_sample, field),                             \
      0, 0

/* offset of a single field of fd6_query_sample: */
#define query_sample(aq, field) query_sample_idx(aq, 0, field)
66 
67 /*
68  * Occlusion Query:
69  *
70  * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
71  * interpret results
72  */
73 
/* Start (or restart after a tile/batch break) an occlusion query: snapshot
 * the current sample counter into sample->start.
 */
static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   /* HW requires the RB_SAMPLE_COUNT_ADDR destination to be 16B aligned: */
   ASSERT_ALIGNED(struct fd6_query_sample, start, 16);

   /* Ask the RB to copy the sample count to memory on the next ZPASS_DONE: */
   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
   OUT_RELOC(ring, query_sample(aq, start));

   fd6_event_write(batch, ring, ZPASS_DONE, false);
}
89 
/* Pause an occlusion query: snapshot the counter into sample->stop and
 * accumulate (stop - start) into sample->result from the tile epilogue.
 */
static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   /* Seed sample->stop with an 0xffffffff sentinel, so the epilogue can
    * poll until the ZPASS_DONE below has actually overwritten it:
    */
   OUT_PKT7(ring, CP_MEM_WRITE, 4);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0xffffffff);

   OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   ASSERT_ALIGNED(struct fd6_query_sample, stop, 16);

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2);
   OUT_RELOC(ring, query_sample(aq, stop));

   fd6_event_write(batch, ring, ZPASS_DONE, false);

   /* To avoid stalling in the draw buffer, emit the code to compute the
    * counter delta in the epilogue ring.
    */
   struct fd_ringbuffer *epilogue = fd_batch_get_tile_epilogue(batch);

   /* Wait until sample->stop no longer holds the sentinel written above: */
   OUT_PKT7(epilogue, CP_WAIT_REG_MEM, 6);
   OUT_RING(epilogue, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                      CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
   OUT_RELOC(epilogue, query_sample(aq, stop));
   OUT_RING(epilogue, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   OUT_RING(epilogue, CP_WAIT_REG_MEM_4_MASK(0xffffffff));
   OUT_RING(epilogue, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result += stop - start: */
   OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9);
   OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */
   OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */
   OUT_RELOC(epilogue, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(epilogue, query_sample(aq, start));  /* srcC */
}
133 
134 static void
occlusion_counter_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)135 occlusion_counter_result(struct fd_acc_query *aq,
136                          struct fd_acc_query_sample *s,
137                          union pipe_query_result *result)
138 {
139    struct fd6_query_sample *sp = fd6_query_sample(s);
140    result->u64 = sp->result;
141 }
142 
/* GPU-side readback: copy the accumulated counter into a caller-provided
 * resource at the requested offset/type.
 */
static void
occlusion_counter_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                                  enum pipe_query_value_type result_type,
                                  int index, struct fd_resource *dst,
                                  unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}
152 
153 static void
occlusion_predicate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)154 occlusion_predicate_result(struct fd_acc_query *aq,
155                            struct fd_acc_query_sample *s,
156                            union pipe_query_result *result)
157 {
158    struct fd6_query_sample *sp = fd6_query_sample(s);
159    result->b = !!sp->result;
160 }
161 
162 static void
occlusion_predicate_result_resource(struct fd_acc_query * aq,struct fd_ringbuffer * ring,enum pipe_query_value_type result_type,int index,struct fd_resource * dst,unsigned offset)163 occlusion_predicate_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
164                                     enum pipe_query_value_type result_type,
165                                     int index, struct fd_resource *dst,
166                                     unsigned offset)
167 {
168    /* This is a bit annoying but we need to turn the result into a one or
169     * zero.. to do this use a CP_COND_WRITE to overwrite the result with
170     * a one if it is non-zero.  This doesn't change the results if the
171     * query is also read on the CPU (ie. occlusion_predicate_result()).
172     */
173    OUT_PKT7(ring, CP_COND_WRITE5, 9);
174    OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) |
175                   CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY) |
176                   CP_COND_WRITE5_0_WRITE_MEMORY);
177    OUT_RELOC(ring, query_sample(aq, result)); /* POLL_ADDR_LO/HI */
178    OUT_RING(ring, CP_COND_WRITE5_3_REF(0));
179    OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
180    OUT_RELOC(ring, query_sample(aq, result)); /* WRITE_ADDR_LO/HI */
181    OUT_RING(ring, 1);
182    OUT_RING(ring, 0);
183 
184    copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
185                offsetof(struct fd6_query_sample, result));
186 }
187 
/* Counter variant: reports the accumulated 64b sample count. */
static const struct fd_acc_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_counter_result,
   .result_resource = occlusion_counter_result_resource,
};

/* Predicate variant: reports a boolean "any samples passed". */
static const struct fd_acc_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
   .result_resource = occlusion_predicate_result_resource,
};

/* Conservative predicate uses the same (precise) implementation as the
 * regular predicate:
 */
static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .size = sizeof(struct fd6_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
   .result_resource = occlusion_predicate_result_resource,
};
214 
215 /*
216  * Timestamp Queries:
217  */
218 
/* Capture a GPU timestamp into sample->start via an RB_DONE_TS event.
 * Shared by both TIMESTAMP and TIME_ELAPSED queries.
 */
static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, start));
   OUT_RING(ring, 0x00000000);
}
230 
/* Pause a TIME_ELAPSED query: capture sample->stop and accumulate the
 * elapsed tick delta into sample->result.
 */
static void
time_elapsed_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0x00000000);

   /* Ensure the timestamp write has landed before CP_MEM_TO_MEM reads it: */
   OUT_WFI5(ring);

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */
}
252 
/* TIMESTAMP queries only need a single snapshot (taken at resume), so
 * pause is a no-op.
 */
static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
   /* We captured a timestamp in timestamp_resume(), nothing to do here. */
}
258 
/* timestamp logging for u_trace: write an RB_DONE_TS timestamp to an
 * arbitrary bo+offset (not tied to a query sample buffer).
 */
static void
record_timestamp(struct fd_ringbuffer *ring, struct fd_bo *bo, unsigned offset)
{
   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, bo, offset, 0, 0);
   OUT_RING(ring, 0x00000000);
}
269 
270 static void
time_elapsed_accumulate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)271 time_elapsed_accumulate_result(struct fd_acc_query *aq,
272                                struct fd_acc_query_sample *s,
273                                union pipe_query_result *result)
274 {
275    struct fd6_query_sample *sp = fd6_query_sample(s);
276    result->u64 = ticks_to_ns(sp->result);
277 }
278 
/* GPU-side readback of TIME_ELAPSED; returns raw ticks (no conversion). */
static void
time_elapsed_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                             enum pipe_query_value_type result_type,
                             int index, struct fd_resource *dst,
                             unsigned offset)
{
   // TODO ticks_to_ns conversion would require spinning up a compute shader?
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, result));
}
289 
290 static void
timestamp_accumulate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)291 timestamp_accumulate_result(struct fd_acc_query *aq,
292                             struct fd_acc_query_sample *s,
293                             union pipe_query_result *result)
294 {
295    struct fd6_query_sample *sp = fd6_query_sample(s);
296    result->u64 = ticks_to_ns(sp->start);
297 }
298 
/* GPU-side readback of TIMESTAMP; returns raw ticks (no conversion). */
static void
timestamp_result_resource(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
                          enum pipe_query_value_type result_type,
                          int index, struct fd_resource *dst,
                          unsigned offset)
{
   // TODO ticks_to_ns conversion would require spinning up a compute shader?
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_query_sample, start));
}
309 
/* TIME_ELAPSED provider; .always because timestamps must be captured even
 * when the query would otherwise be inactive for a tile:
 */
static const struct fd_acc_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .size = sizeof(struct fd6_query_sample),
   .resume = timestamp_resume,
   .pause = time_elapsed_pause,
   .result = time_elapsed_accumulate_result,
   .result_resource = time_elapsed_result_resource,
};
319 
320 /* NOTE: timestamp query isn't going to give terribly sensible results
321  * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
322  * add in a binning pass, the results get even more non-sensical.  So
323  * we just return the timestamp on the last tile and hope that is
324  * kind of good enough.
325  */
326 
/* TIMESTAMP provider; see tiler caveat in the comment above. */
static const struct fd_acc_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .size = sizeof(struct fd6_query_sample),
   .resume = timestamp_resume,
   .pause = timestamp_pause,
   .result = timestamp_accumulate_result,
   .result_resource = timestamp_result_resource,
};
336 
/* Sample layout for pipeline-statistics queries: 64b snapshots of a single
 * RBBM_PRIMCTR counter at resume (start) and pause (stop), plus the
 * accumulated delta (result).
 */
struct PACKED fd6_pipeline_stats_sample {
   struct fd_acc_query_sample base;

   uint64_t start, stop, result;
};
DEFINE_CAST(fd_acc_query_sample, fd6_pipeline_stats_sample);

/* Emit an OUT_RELOC pointing at 'field' within the query's sample buffer.
 * NOTE(review): the expansion already ends with ';', so call sites compile
 * with or without a trailing semicolon.
 */
#define stats_reloc(ring, aq, field)                                           \
   OUT_RELOC(ring, fd_resource((aq)->prsc)->bo,                                \
             __offsetof(struct fd6_pipeline_stats_sample, field), 0, 0);
347 
348 /* Mapping of counters to pipeline stats:
349  *
350  *   Gallium (PIPE_STAT_QUERY_x) | Vulkan (VK_QUERY_PIPELINE_STATISTIC_x_BIT) | hw counter
351  *   ----------------------------+--------------------------------------------+----------------
352  *   IA_VERTICES                 | INPUT_ASSEMBLY_VERTICES                    | RBBM_PRIMCTR_0
353  *   IA_PRIMITIVES               | INPUT_ASSEMBLY_PRIMITIVES                  | RBBM_PRIMCTR_1
354  *   VS_INVOCATIONS              | VERTEX_SHADER_INVOCATIONS                  | RBBM_PRIMCTR_0
355  *   GS_INVOCATIONS              | GEOMETRY_SHADER_INVOCATIONS                | RBBM_PRIMCTR_5
356  *   GS_PRIMITIVES               | GEOMETRY_SHADER_PRIMITIVES                 | RBBM_PRIMCTR_6
357  *   C_INVOCATIONS               | CLIPPING_INVOCATIONS                       | RBBM_PRIMCTR_7
358  *   C_PRIMITIVES                | CLIPPING_PRIMITIVES                        | RBBM_PRIMCTR_8
359  *   PS_INVOCATIONS              | FRAGMENT_SHADER_INVOCATIONS                | RBBM_PRIMCTR_9
360  *   HS_INVOCATIONS              | TESSELLATION_CONTROL_SHADER_PATCHES        | RBBM_PRIMCTR_2
361  *   DS_INVOCATIONS              | TESSELLATION_EVALUATION_SHADER_INVOCATIONS | RBBM_PRIMCTR_4
362  *   CS_INVOCATIONS              | COMPUTE_SHADER_INVOCATIONS                 | RBBM_PRIMCTR_10
363  *
364  * Note that "Vertices corresponding to incomplete primitives may contribute to the count.",
365  * in our case they do not, so IA_VERTICES and VS_INVOCATIONS are the same thing.
366  */
367 
/* The HW has separate start/stop events for primitive, fragment, and compute
 * counters; each class is refcounted independently on the batch (see
 * pipeline_stats_resume()/pause()).
 */
enum stats_type {
   STATS_PRIMITIVE,
   STATS_FRAGMENT,
   STATS_COMPUTE,
};

/* VGT events used to start/stop each counter class: */
static const struct {
   enum vgt_event_type start, stop;
} stats_counter_events[] = {
      [STATS_PRIMITIVE] = { START_PRIMITIVE_CTRS, STOP_PRIMITIVE_CTRS },
      [STATS_FRAGMENT]  = { START_FRAGMENT_CTRS,  STOP_FRAGMENT_CTRS },
      [STATS_COMPUTE]   = { START_COMPUTE_CTRS,   STOP_COMPUTE_CTRS },
};
381 
382 static enum stats_type
get_stats_type(struct fd_acc_query * aq)383 get_stats_type(struct fd_acc_query *aq)
384 {
385    if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
386       return STATS_PRIMITIVE;
387 
388    switch (aq->base.index) {
389    case PIPE_STAT_QUERY_PS_INVOCATIONS: return STATS_FRAGMENT;
390    case PIPE_STAT_QUERY_CS_INVOCATIONS: return STATS_COMPUTE;
391    default:
392       return STATS_PRIMITIVE;
393    }
394 }
395 
/* Map a query to its RBBM_PRIMCTR_n counter index; see the gallium/vulkan/hw
 * mapping table above.  PRIMITIVES_GENERATED uses the clipper invocation
 * counter (RBBM_PRIMCTR_7).
 */
static unsigned
stats_counter_index(struct fd_acc_query *aq)
{
   if (aq->provider->query_type == PIPE_QUERY_PRIMITIVES_GENERATED)
      return 7;

   switch (aq->base.index) {
   case PIPE_STAT_QUERY_IA_VERTICES:    return 0;
   case PIPE_STAT_QUERY_IA_PRIMITIVES:  return 1;
   case PIPE_STAT_QUERY_VS_INVOCATIONS: return 0;
   case PIPE_STAT_QUERY_GS_INVOCATIONS: return 5;
   case PIPE_STAT_QUERY_GS_PRIMITIVES:  return 6;
   case PIPE_STAT_QUERY_C_INVOCATIONS:  return 7;
   case PIPE_STAT_QUERY_C_PRIMITIVES:   return 8;
   case PIPE_STAT_QUERY_PS_INVOCATIONS: return 9;
   case PIPE_STAT_QUERY_HS_INVOCATIONS: return 2;
   case PIPE_STAT_QUERY_DS_INVOCATIONS: return 4;
   case PIPE_STAT_QUERY_CS_INVOCATIONS: return 10;
   default:
      return 0;
   }
}
418 
/* Debug-only dump of a pipeline-stats snapshot.  'idx' is the RBBM_PRIMCTR
 * counter index; 'labels' is indexed by counter index (entry 3 is unused by
 * the mapping above, hence "??").
 */
static void
log_pipeline_stats(struct fd6_pipeline_stats_sample *ps, unsigned idx)
{
#ifdef DEBUG_COUNTERS
   const char *labels[] = {
      "VS_INVOCATIONS",
      "IA_PRIMITIVES",
      "HS_INVOCATIONS",
      "??",
      "DS_INVOCATIONS",
      "GS_INVOCATIONS",
      "GS_PRIMITIVES",
      "C_INVOCATIONS",
      "C_PRIMITIVES",
      "PS_INVOCATIONS",
      "CS_INVOCATIONS",
   };

   mesa_logd("  counter\t\tstart\t\t\tstop\t\t\tdiff");
   mesa_logd("  RBBM_PRIMCTR_%d\t0x%016" PRIx64 "\t0x%016" PRIx64 "\t%" PRIi64 "\t%s",
             idx, ps->start, ps->stop, ps->stop - ps->start, labels[idx]);
#endif
}
442 
/* Resume a pipeline-stats query: snapshot the counter into sample->start and
 * start the HW counter class if this is the first active query of its type.
 */
static void
pipeline_stats_resume(struct fd_acc_query *aq, struct fd_batch *batch)
   assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;
   enum stats_type type = get_stats_type(aq);
   unsigned idx = stats_counter_index(aq);
   /* Each RBBM_PRIMCTR_n is a 64b counter, ie. two 32b registers: */
   unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);

   OUT_WFI5(ring);

   /* Snapshot the current 64b counter value into sample->start: */
   OUT_PKT7(ring, CP_REG_TO_MEM, 3);
   OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_REG(reg));
   stats_reloc(ring, aq, start);

   assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));

   /* Counter classes are refcounted; only emit the start event on the
    * 0 -> 1 transition:
    */
   if (!batch->pipeline_stats_queries_active[type])
      fd6_event_write(batch, ring, stats_counter_events[type].start, false);
   batch->pipeline_stats_queries_active[type]++;
}
466 
467 static void
pipeline_stats_pause(struct fd_acc_query * aq,struct fd_batch * batch)468 pipeline_stats_pause(struct fd_acc_query *aq, struct fd_batch *batch)
469    assert_dt
470 {
471    struct fd_ringbuffer *ring = batch->draw;
472    enum stats_type type = get_stats_type(aq);
473    unsigned idx = stats_counter_index(aq);
474    unsigned reg = REG_A6XX_RBBM_PRIMCTR_0_LO + (2 * idx);
475 
476    OUT_WFI5(ring);
477 
478    /* snapshot the end values: */
479    OUT_PKT7(ring, CP_REG_TO_MEM, 3);
480    OUT_RING(ring, CP_REG_TO_MEM_0_64B |
481                   CP_REG_TO_MEM_0_CNT(2) |
482                   CP_REG_TO_MEM_0_REG(reg));
483    stats_reloc(ring, aq, stop);
484 
485    assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active));
486    assert(batch->pipeline_stats_queries_active[type] > 0);
487 
488    batch->pipeline_stats_queries_active[type]--;
489    if (batch->pipeline_stats_queries_active[type])
490       fd6_event_write(batch, ring, stats_counter_events[type].stop, false);
491 
492    /* result += stop - start: */
493    OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
494    OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x40000000);
495    stats_reloc(ring, aq, result);
496    stats_reloc(ring, aq, result);
497    stats_reloc(ring, aq, stop)
498    stats_reloc(ring, aq, start);
499 }
500 
501 static void
pipeline_stats_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)502 pipeline_stats_result(struct fd_acc_query *aq,
503                       struct fd_acc_query_sample *s,
504                       union pipe_query_result *result)
505 {
506    struct fd6_pipeline_stats_sample *ps = fd6_pipeline_stats_sample(s);
507 
508    log_pipeline_stats(ps, stats_counter_index(aq));
509 
510    result->u64 = ps->result;
511 }
512 
/* GPU-side readback: copy the accumulated counter delta to the destination
 * resource.
 */
static void
pipeline_stats_result_resource(struct fd_acc_query *aq,
                               struct fd_ringbuffer *ring,
                               enum pipe_query_value_type result_type,
                               int index, struct fd_resource *dst,
                               unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_pipeline_stats_sample, result));
}
523 
/* PRIMITIVES_GENERATED is implemented via the clipper counter (see
 * stats_counter_index()):
 */
static const struct fd_acc_sample_provider primitives_generated = {
   .query_type = PIPE_QUERY_PRIMITIVES_GENERATED,
   .size = sizeof(struct fd6_pipeline_stats_sample),
   .resume = pipeline_stats_resume,
   .pause = pipeline_stats_pause,
   .result = pipeline_stats_result,
   .result_resource = pipeline_stats_result_resource,
};

/* Single pipeline-statistic counter, selected by aq->base.index: */
static const struct fd_acc_sample_provider pipeline_statistics_single = {
   .query_type = PIPE_QUERY_PIPELINE_STATISTICS_SINGLE,
   .size = sizeof(struct fd6_pipeline_stats_sample),
   .resume = pipeline_stats_resume,
   .pause = pipeline_stats_pause,
   .result = pipeline_stats_result,
   .result_resource = pipeline_stats_result_resource,
};
541 
/* Sample layout for stream-output (primitives emitted/generated) queries:
 * per-stream emitted/generated pairs snapshotted at resume and pause, plus
 * the accumulated result.
 */
struct PACKED fd6_primitives_sample {
   struct fd_acc_query_sample base;

   /* VPC_SO_STREAM_COUNTS dest address must be 32-byte aligned (see the
    * ASSERT_ALIGNED checks in resume/pause):
    */
   uint64_t pad[3];

   struct {
      uint64_t emitted, generated;
   } start[4], stop[4], result;
};
DEFINE_CAST(fd_acc_query_sample, fd6_primitives_sample);

/* Emit an OUT_RELOC pointing at 'field' within the query's sample buffer: */
#define primitives_reloc(ring, aq, field)                                      \
   OUT_RELOC(ring, fd_resource((aq)->prsc)->bo,                                \
             __offsetof(struct fd6_primitives_sample, field), 0, 0);
557 
/* Debug-only dump of a stream-output sample.  Fixed so each label prints
 * its matching field (the "emitted" rows previously printed the .generated
 * values and vice versa).
 */
static void
log_primitives_sample(struct fd6_primitives_sample *ps)
{
#ifdef DEBUG_COUNTERS
   mesa_logd("  so counts");
   for (int i = 0; i < ARRAY_SIZE(ps->start); i++) {
      mesa_logd("  CHANNEL %d emitted\t0x%016" PRIx64 "\t0x%016" PRIx64
             "\t%" PRIi64,
             i, ps->start[i].emitted, ps->stop[i].emitted,
             ps->stop[i].emitted - ps->start[i].emitted);
      mesa_logd("  CHANNEL %d generated\t0x%016" PRIx64 "\t0x%016" PRIx64
             "\t%" PRIi64,
             i, ps->start[i].generated, ps->stop[i].generated,
             ps->stop[i].generated - ps->start[i].generated);
   }

   mesa_logd("generated %" PRIu64 ", emitted %" PRIu64, ps->result.generated,
          ps->result.emitted);
#endif
}
578 
/* Resume a stream-output query: snapshot all VPC_SO_STREAM_COUNTS into
 * sample->start[].
 */
static void
primitives_emitted_resume(struct fd_acc_query *aq,
                          struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_WFI5(ring);

   /* HW requires a 32B-aligned destination for VPC_SO_STREAM_COUNTS: */
   ASSERT_ALIGNED(struct fd6_primitives_sample, start[0], 32);

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2);
   primitives_reloc(ring, aq, start[0]);

   fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false);
}
594 
/* Accumulate stream 'idx' emitted-primitive delta into result.emitted.
 * (Name typo "accumultate" is historical; kept to avoid churn.)
 */
static void
accumultate_primitives_emitted(struct fd_acc_query *aq,
                               struct fd_ringbuffer *ring,
                               int idx)
{
   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
   primitives_reloc(ring, aq, result.emitted);
   primitives_reloc(ring, aq, result.emitted);
   primitives_reloc(ring, aq, stop[idx].emitted);
   primitives_reloc(ring, aq, start[idx].emitted);
}
608 
/* Accumulate stream 'idx' generated-primitive delta into result.generated. */
static void
accumultate_primitives_generated(struct fd_acc_query *aq,
                                 struct fd_ringbuffer *ring,
                                 int idx)
{
   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, stop[idx].generated);
   primitives_reloc(ring, aq, start[idx].generated);
}
622 
/* Pause a stream-output query: snapshot counts into sample->stop[] and
 * accumulate the deltas needed by the query type.
 */
static void
primitives_emitted_pause(struct fd_acc_query *aq,
                         struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_WFI5(ring);

   ASSERT_ALIGNED(struct fd6_primitives_sample, stop[0], 32);

   OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2);
   primitives_reloc(ring, aq, stop[0]);

   fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false);

   /* Flush the count writes before the CP_MEM_TO_MEMs below read them: */
   fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true);

   if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      /* Need results from all channels: */
      for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
         accumultate_primitives_emitted(aq, ring, i);
         accumultate_primitives_generated(aq, ring, i);
      }
   } else {
      accumultate_primitives_emitted(aq, ring, aq->base.index);
      /* Only need primitives generated counts for the overflow queries: */
      if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
         accumultate_primitives_generated(aq, ring, aq->base.index);
   }
}
653 
654 static void
primitives_emitted_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)655 primitives_emitted_result(struct fd_acc_query *aq,
656                           struct fd_acc_query_sample *s,
657                           union pipe_query_result *result)
658 {
659    struct fd6_primitives_sample *ps = fd6_primitives_sample(s);
660 
661    log_primitives_sample(ps);
662 
663    result->u64 = ps->result.emitted;
664 }
665 
/* GPU-side readback: copy the accumulated emitted count to the destination
 * resource.
 */
static void
primitives_emitted_result_resource(struct fd_acc_query *aq,
                                   struct fd_ringbuffer *ring,
                                   enum pipe_query_value_type result_type,
                                   int index, struct fd_resource *dst,
                                   unsigned offset)
{
   copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc),
               offsetof(struct fd6_primitives_sample, result.emitted));
}
676 
677 static void
so_overflow_predicate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)678 so_overflow_predicate_result(struct fd_acc_query *aq,
679                              struct fd_acc_query_sample *s,
680                              union pipe_query_result *result)
681 {
682    struct fd6_primitives_sample *ps = fd6_primitives_sample(s);
683 
684    log_primitives_sample(ps);
685 
686    result->b = ps->result.emitted != ps->result.generated;
687 }
688 
/* GPU-side readback of the overflow predicate: compute (generated - emitted)
 * directly into the destination, then normalize it to 1/0.
 */
static void
so_overflow_predicate_result_resource(struct fd_acc_query *aq,
                                      struct fd_ringbuffer *ring,
                                      enum pipe_query_value_type result_type,
                                      int index, struct fd_resource *dst,
                                      unsigned offset)
{
   fd_ringbuffer_attach_bo(ring, dst->bo);
   fd_ringbuffer_attach_bo(ring, fd_resource(aq->prsc)->bo);

   /* result = generated - emitted: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 7);
   OUT_RING(ring, CP_MEM_TO_MEM_0_NEG_B |
            COND(result_type >= PIPE_QUERY_TYPE_I64, CP_MEM_TO_MEM_0_DOUBLE));
   OUT_RELOC(ring, dst->bo, offset, 0, 0);
   primitives_reloc(ring, aq, result.generated);
   primitives_reloc(ring, aq, result.emitted);

   /* This is a bit awkward, but glcts expects the result to be 1 or 0
    * rather than non-zero vs zero:
    */
   OUT_PKT7(ring, CP_COND_WRITE5, 9);
   OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) |
                  CP_COND_WRITE5_0_POLL(POLL_MEMORY) |
                  CP_COND_WRITE5_0_WRITE_MEMORY);
   OUT_RELOC(ring, dst->bo, offset, 0, 0);    /* POLL_ADDR_LO/HI */
   OUT_RING(ring, CP_COND_WRITE5_3_REF(0));
   OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0));
   OUT_RELOC(ring, dst->bo, offset, 0, 0);    /* WRITE_ADDR_LO/HI */
   OUT_RING(ring, 1);
   OUT_RING(ring, 0);
}
721 
/* PIPE_QUERY_PRIMITIVES_EMITTED provider.  Shares the primitives sample
 * resume/pause hooks with the streamout-overflow queries below; only the
 * result readback differs.
 */
static const struct fd_acc_sample_provider primitives_emitted = {
   .query_type = PIPE_QUERY_PRIMITIVES_EMITTED,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume,
   .pause = primitives_emitted_pause,
   .result = primitives_emitted_result,
   .result_resource = primitives_emitted_result_resource,
};
730 
/* PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE provider: same sampling as the
 * primitives-emitted query, but the result is reduced to a boolean
 * (generated != emitted) by so_overflow_predicate_result*().
 */
static const struct fd_acc_sample_provider so_overflow_any_predicate = {
   .query_type = PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume,
   .pause = primitives_emitted_pause,
   .result = so_overflow_predicate_result,
   .result_resource = so_overflow_predicate_result_resource,
};
739 
/* PIPE_QUERY_SO_OVERFLOW_PREDICATE provider: identical plumbing to the
 * "any" variant above (both use the same result callbacks, which ignore
 * the stream index).
 */
static const struct fd_acc_sample_provider so_overflow_predicate = {
   .query_type = PIPE_QUERY_SO_OVERFLOW_PREDICATE,
   .size = sizeof(struct fd6_primitives_sample),
   .resume = primitives_emitted_resume,
   .pause = primitives_emitted_pause,
   .result = so_overflow_predicate_result,
   .result_resource = so_overflow_predicate_result_resource,
};
748 
/*
 * Performance Counter (batch) queries:
 *
 * Only one of these is active at a time, per the design of the gallium
 * batch_query API.  One perfcntr query tracks N query_types, each of
 * which has a 'fd_batch_query_entry' that maps it back to the
 * associated group and counter.
 */
757 
/* Maps one requested query_type back to its perfcntr (group, countable): */
struct fd_batch_query_entry {
   uint8_t gid; /* group-id */
   uint8_t cid; /* countable-id within the group */
};
762 
/* Driver-private state attached to a batch query (aq->query_data): */
struct fd_batch_query_data {
   struct fd_screen *screen;
   unsigned num_query_entries;
   struct fd_batch_query_entry query_entries[]; /* one per requested query_type */
};
768 
/* Program the requested countables into the hw counter select registers
 * and snapshot each counter's starting value into the sample buffer.
 */
static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   /* Tracks how many counters of each group have been assigned so far,
    * so successive entries in the same group get successive counters:
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   /* NOTE(review): WFI before reprogramming counter selects — presumably
    * to let prior work drain first; confirm against hw docs.
    */
   OUT_WFI5(ring);

   /* configure performance counters for the requested queries: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;

      /* Per-group capacity was validated at create time: */
      assert(counter_idx < g->num_counters);

      OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
      OUT_RING(ring, g->countables[entry->cid].selector);
   }

   /* Re-walk the entries with the same counter assignment as above: */
   memset(counters_per_group, 0, sizeof(counters_per_group));

   /* and snapshot the start values */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      /* 64b copy of the counter register pair into sample[i].start: */
      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                        CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, start));
   }
}
808 
/* Snapshot the ending counter values and accumulate (stop - start) into
 * each entry's running result, all on the GPU.
 */
static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   /* Same per-group counter assignment scheme as perfcntr_resume(): */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   OUT_WFI5(ring);

   /* TODO do we need to bother to turn anything off? */

   /* snapshot the end values: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      /* 64b copy of the counter register pair into sample[i].stop: */
      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                        CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));
   }

   /* and compute the result: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      /* result += stop - start: */
      OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
      OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));   /* srcB */
      OUT_RELOC(ring, query_sample_idx(aq, i, start));  /* srcC */
   }
}
847 
848 static void
perfcntr_accumulate_result(struct fd_acc_query * aq,struct fd_acc_query_sample * s,union pipe_query_result * result)849 perfcntr_accumulate_result(struct fd_acc_query *aq,
850                            struct fd_acc_query_sample *s,
851                            union pipe_query_result *result)
852 {
853    struct fd_batch_query_data *data =
854          (struct fd_batch_query_data *)aq->query_data;
855    struct fd6_query_sample *sp = fd6_query_sample(s);
856 
857    for (unsigned i = 0; i < data->num_query_entries; i++) {
858       result->batch[i].u64 = sp[i].result;
859    }
860 }
861 
862 static const struct fd_acc_sample_provider perfcntr = {
863    .query_type = FD_QUERY_FIRST_PERFCNTR,
864    .always = true,
865    .resume = perfcntr_resume,
866    .pause = perfcntr_pause,
867    .result = perfcntr_accumulate_result,
868 };
869 
870 static struct pipe_query *
fd6_create_batch_query(struct pipe_context * pctx,unsigned num_queries,unsigned * query_types)871 fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
872                        unsigned *query_types)
873 {
874    struct fd_context *ctx = fd_context(pctx);
875    struct fd_screen *screen = ctx->screen;
876    struct fd_query *q;
877    struct fd_acc_query *aq;
878    struct fd_batch_query_data *data;
879 
880    data = CALLOC_VARIANT_LENGTH_STRUCT(
881       fd_batch_query_data, num_queries * sizeof(data->query_entries[0]));
882 
883    data->screen = screen;
884    data->num_query_entries = num_queries;
885 
886    /* validate the requested query_types and ensure we don't try
887     * to request more query_types of a given group than we have
888     * counters:
889     */
890    unsigned counters_per_group[screen->num_perfcntr_groups];
891    memset(counters_per_group, 0, sizeof(counters_per_group));
892 
893    for (unsigned i = 0; i < num_queries; i++) {
894       unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;
895 
896       /* verify valid query_type, ie. is it actually a perfcntr? */
897       if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
898           (idx >= screen->num_perfcntr_queries)) {
899          mesa_loge("invalid batch query query_type: %u", query_types[i]);
900          goto error;
901       }
902 
903       struct fd_batch_query_entry *entry = &data->query_entries[i];
904       struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];
905 
906       entry->gid = pq->group_id;
907 
908       /* the perfcntr_queries[] table flattens all the countables
909        * for each group in series, ie:
910        *
911        *   (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
912        *
913        * So to find the countable index just step back through the
914        * table to find the first entry with the same group-id.
915        */
916       while (pq > screen->perfcntr_queries) {
917          pq--;
918          if (pq->group_id == entry->gid)
919             entry->cid++;
920       }
921 
922       if (counters_per_group[entry->gid] >=
923           screen->perfcntr_groups[entry->gid].num_counters) {
924          mesa_loge("too many counters for group %u", entry->gid);
925          goto error;
926       }
927 
928       counters_per_group[entry->gid]++;
929    }
930 
931    q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
932    aq = fd_acc_query(q);
933 
934    /* sample buffer size is based on # of queries: */
935    aq->size = num_queries * sizeof(struct fd6_query_sample);
936    aq->query_data = data;
937 
938    return (struct pipe_query *)q;
939 
940 error:
941    free(data);
942    return NULL;
943 }
944 
/* Hook up a6xx query support: install the context-level query entry
 * points and register all accumulating-query providers.
 */
void
fd6_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   /* Generic accumulated-query machinery handles create/update: */
   ctx->create_query = fd_acc_create_query;
   ctx->query_update_batch = fd_acc_query_update_batch;

   /* Timestamp capture and tick->ns conversion helpers: */
   ctx->record_timestamp = record_timestamp;
   ctx->ts_to_ns = ticks_to_ns;

   /* Perfcntr batch queries go through the dedicated entry point: */
   pctx->create_batch_query = fd6_create_batch_query;

   fd_acc_query_register_provider(pctx, &occlusion_counter);
   fd_acc_query_register_provider(pctx, &occlusion_predicate);
   fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);

   fd_acc_query_register_provider(pctx, &time_elapsed);
   fd_acc_query_register_provider(pctx, &timestamp);

   fd_acc_query_register_provider(pctx, &primitives_generated);
   fd_acc_query_register_provider(pctx, &pipeline_statistics_single);

   fd_acc_query_register_provider(pctx, &primitives_emitted);
   fd_acc_query_register_provider(pctx, &so_overflow_any_predicate);
   fd_acc_query_register_provider(pctx, &so_overflow_predicate);
}
972