• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #include "freedreno_context.h"
28 #include "freedreno_query_hw.h"
29 #include "freedreno_util.h"
30 
31 #include "fd4_context.h"
32 #include "fd4_draw.h"
33 #include "fd4_format.h"
34 #include "fd4_query.h"
35 
/* Layout of the buffer the ZPASS_DONE event dumps the RB sample
 * counters into.  Only ctr[0] is consumed below in count_samples();
 * presumably the remaining slots correspond to other RB units --
 * TODO confirm against a4xx docs.
 */
struct fd_rb_samp_ctrs {
   uint64_t ctr[16];
};
39 
40 /*
41  * Occlusion Query:
42  *
43  * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
44  * interpret results
45  */
46 
/* Emit cmdstream to capture one occlusion sample for the current tile:
 * point RB_SAMPLE_COUNT_ADDR at this sample's slot in the per-tile
 * query buffer, issue a dummy draw, and write the ZPASS_DONE event to
 * trigger the counter dump.
 */
static struct fd_hw_sample *
occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
   struct fd_hw_sample *samp =
      fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs));

   /* low bits of sample addr should be zero (since they are control
    * flags in RB_SAMPLE_COUNT_CONTROL):
    */
   assert((samp->offset & 0x3) == 0);

   /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of
    * HW_QUERY_BASE_REG register:
    */
   OUT_PKT3(ring, CP_SET_CONSTANT, 3);
   OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000);
   OUT_RING(ring, HW_QUERY_BASE_REG);
   OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset);

   /* NOTE(review): zero-index point draw -- presumably needed to latch
    * the sample-count state before the event; confirm against hw docs.
    */
   OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3);
   OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX,
                        INDEX4_SIZE_32_BIT, USE_VISIBILITY));
   OUT_RING(ring, 1); /* NumInstances */
   OUT_RING(ring, 0); /* NumIndices */

   /* ZPASS_DONE dumps the RB sample counters to the address programmed
    * above (see struct fd_rb_samp_ctrs):
    */
   fd_event_write(batch, ring, ZPASS_DONE);

   return samp;
}
76 
77 static uint64_t
count_samples(const struct fd_rb_samp_ctrs * start,const struct fd_rb_samp_ctrs * end)78 count_samples(const struct fd_rb_samp_ctrs *start,
79               const struct fd_rb_samp_ctrs *end)
80 {
81    return end->ctr[0] - start->ctr[0];
82 }
83 
84 static void
occlusion_counter_accumulate_result(struct fd_context * ctx,const void * start,const void * end,union pipe_query_result * result)85 occlusion_counter_accumulate_result(struct fd_context *ctx, const void *start,
86                                     const void *end,
87                                     union pipe_query_result *result)
88 {
89    uint64_t n = count_samples(start, end);
90    result->u64 += n;
91 }
92 
93 static void
occlusion_predicate_accumulate_result(struct fd_context * ctx,const void * start,const void * end,union pipe_query_result * result)94 occlusion_predicate_accumulate_result(struct fd_context *ctx, const void *start,
95                                       const void *end,
96                                       union pipe_query_result *result)
97 {
98    uint64_t n = count_samples(start, end);
99    result->b |= (n > 0);
100 }
101 
102 /*
103  * Time Elapsed Query:
104  *
105  * Note: we could in theory support timestamp queries, but they
106  * won't give sensible results for tilers.
107  */
108 
/* Program CP perfcounter 0 to count every cycle (CP_ALWAYS_COUNT), so
 * RBBM_PERFCTR_CP_0_LO/_HI (read back in time_elapsed_get_sample) act
 * as a free-running timebase.
 */
static void
time_elapsed_enable(struct fd_context *ctx,
                    struct fd_ringbuffer *ring) assert_dt
{
   /* Right now, the assignment of countable to counter register is
    * just hard coded.  If we start exposing more countables than we
    * have counters, we will need to be more clever.
    */
   struct fd_batch *batch = fd_context_batch(ctx);
   /* wait-for-idle before reprogramming the counter select: */
   fd_wfi(batch, ring);
   OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1);
   OUT_RING(ring, CP_ALWAYS_COUNT);
   /* drop the reference fd_context_batch() returned: */
   fd_batch_reference(&batch, NULL);
}
123 
/* Capture a 64b CP cycle-counter sample into this sample's per-tile
 * result slot.  There is no pm4 packet that copies a counter to a
 * relative (per-tile) address, so this goes through a scratch buffer
 * in several steps -- see the long comment below; the step order is
 * load-bearing.
 */
static struct fd_hw_sample *
time_elapsed_get_sample(struct fd_batch *batch,
                        struct fd_ringbuffer *ring) assert_dt
{
   struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t));

   /* use unused part of vsc_size_mem as scratch space, to avoid
    * extra allocation:
    */
   struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem;
   const int sample_off = 128;
   const int addr_off = sample_off + 8;

   /* max_freq is needed later to convert cycles to ns: */
   assert(batch->ctx->screen->max_freq > 0);

   /* Basic issue is that we need to read counter value to a relative
    * destination (with per-tile offset) rather than absolute dest
    * addr.  But there is no pm4 packet that can do that.  This is
    * where it would be *really* nice if we could write our own fw
    * since afaict implementing the sort of packet we need would be
    * trivial.
    *
    * Instead, we:
    * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer
    * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer
    * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base
    *     address to the per-sample offset in the scratch buffer
    * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3
    *     to CP_ME_NRT_ADDR
    * (5) CP_MEM_TO_REG's to copy saved counter value from scratch
    *     buffer to CP_ME_NRT_DATA to trigger the write out to query
    *     result buffer
    *
    * Straightforward, right?
    *
    * Maybe could swap the order of things in the scratch buffer to
    * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one
    * shot, but that's really just polishing a turd..
    */

   fd_wfi(batch, ring);

   /* copy sample counter _LO and _HI to scratch: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) |
                     CP_REG_TO_MEM_0_64B |
                     CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* ok... here we really *would* like to use the CP_SET_CONSTANT
    * mode which can add a constant to value in reg2 and write to
    * reg1... *but* that only works for banked/context registers,
    * and CP_ME_NRT_DATA isn't one of those.. so we need to do some
    * CP math to the scratch buffer instead:
    *
    * (note first 8 bytes are counter value, use offset 0x8 for
    * address calculation)
    */

   /* per-sample offset to scratch bo: */
   OUT_PKT3(ring, CP_MEM_WRITE, 2);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);
   OUT_RING(ring, samp->offset);

   /* now add to that the per-tile base: */
   OUT_PKT3(ring, CP_REG_TO_MEM, 2);
   OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) |
                     CP_REG_TO_MEM_0_ACCUMULATE |
                     CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* now copy that back to CP_ME_NRT_ADDR: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR);
   OUT_RELOC(ring, scratch_bo, addr_off, 0, 0);

   /* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA
    * to trigger the write to result buffer
    */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off, 0, 0);

   /* and again to get the value of the _HI reg from scratch: */
   OUT_PKT3(ring, CP_MEM_TO_REG, 2);
   OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA);
   OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0);

   /* Sigh.. */

   return samp;
}
216 
217 static void
time_elapsed_accumulate_result(struct fd_context * ctx,const void * start,const void * end,union pipe_query_result * result)218 time_elapsed_accumulate_result(struct fd_context *ctx, const void *start,
219                                const void *end, union pipe_query_result *result)
220 {
221    uint64_t n = *(uint64_t *)end - *(uint64_t *)start;
222    /* max_freq is in Hz, convert cycle count to ns: */
223    result->u64 += n * 1000000000 / ctx->screen->max_freq;
224 }
225 
226 static void
timestamp_accumulate_result(struct fd_context * ctx,const void * start,const void * end,union pipe_query_result * result)227 timestamp_accumulate_result(struct fd_context *ctx, const void *start,
228                             const void *end, union pipe_query_result *result)
229 {
230    /* just return the value from fist tile: */
231    if (result->u64 != 0)
232       return;
233    uint64_t n = *(uint64_t *)start;
234    /* max_freq is in Hz, convert cycle count to ns: */
235    result->u64 = n * 1000000000 / ctx->screen->max_freq;
236 }
237 
/* Provider for PIPE_QUERY_OCCLUSION_COUNTER: sums passed samples. */
static const struct fd_hw_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_counter_accumulate_result,
};
243 
/* Provider for PIPE_QUERY_OCCLUSION_PREDICATE: same samples as the
 * counter query, boolean interpretation.
 */
static const struct fd_hw_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};
249 
/* Conservative predicate: exact result is also a valid conservative
 * one, so it reuses the precise predicate implementation.
 */
static const struct fd_hw_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .get_sample = occlusion_get_sample,
   .accumulate_result = occlusion_predicate_accumulate_result,
};
255 
/* Provider for PIPE_QUERY_TIME_ELAPSED; .always so samples are taken
 * even while the query counter state says inactive -- NOTE(review):
 * exact .always semantics live in freedreno_query_hw, confirm there.
 */
static const struct fd_hw_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = time_elapsed_accumulate_result,
};
263 
/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */
static const struct fd_hw_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .enable = time_elapsed_enable,
   .get_sample = time_elapsed_get_sample,
   .accumulate_result = timestamp_accumulate_result,
};
277 
278 void
fd4_query_context_init(struct pipe_context * pctx)279 fd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
280 {
281    struct fd_context *ctx = fd_context(pctx);
282 
283    ctx->create_query = fd_hw_create_query;
284    ctx->query_prepare = fd_hw_query_prepare;
285    ctx->query_prepare_tile = fd_hw_query_prepare_tile;
286    ctx->query_update_batch = fd_hw_query_update_batch;
287 
288    fd_hw_query_register_provider(pctx, &occlusion_counter);
289    fd_hw_query_register_provider(pctx, &occlusion_predicate);
290    fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative);
291    fd_hw_query_register_provider(pctx, &time_elapsed);
292    fd_hw_query_register_provider(pctx, &timestamp);
293 }
294