/*
 * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

/* NOTE: see https://github.com/freedreno/freedreno/wiki/A5xx-Queries */

#include "freedreno_query_acc.h"
#include "freedreno_resource.h"

#include "fd5_context.h"
#include "fd5_format.h"
#include "fd5_query.h"

struct PACKED fd5_query_sample {
	uint64_t start;
	uint64_t result;
	uint64_t stop;
};

/* offset of a single field of an array of fd5_query_sample: */
#define query_sample_idx(aq, idx, field)              \
	fd_resource((aq)->prsc)->bo,                      \
	((idx) * sizeof(struct fd5_query_sample)) +       \
		offsetof(struct fd5_query_sample, field),     \
	0, 0

/* offset of a single field of fd5_query_sample: */
#define query_sample(aq, field) \
	query_sample_idx(aq, 0, field)
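
/* For illustration (not part of the driver): the macros above expand to
 * the (bo, offset, or, shift) argument list that OUT_RELOC() expects, so
 *
 *    OUT_RELOC(ring, query_sample(aq, start));
 *
 * is equivalent to:
 *
 *    OUT_RELOC(ring, fd_resource(aq->prsc)->bo,
 *              offsetof(struct fd5_query_sample, start), 0, 0);
 */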

/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results.
 */

static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_ringbuffer *ring = batch->draw;

	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
	OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
	OUT_RELOC(ring, query_sample(aq, start));

	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
	OUT_RING(ring, ZPASS_DONE);
	fd_reset_wfi(batch);

	fd5_context(batch->ctx)->samples_passed_queries++;
}

static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_ringbuffer *ring = batch->draw;

	OUT_PKT7(ring, CP_MEM_WRITE, 4);
	OUT_RELOC(ring, query_sample(aq, stop));
	OUT_RING(ring, 0xffffffff);
	OUT_RING(ring, 0xffffffff);

	OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);

	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
	OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

	OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
	OUT_RELOC(ring, query_sample(aq, stop));

	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
	OUT_RING(ring, ZPASS_DONE);
	fd_reset_wfi(batch);

	OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
	OUT_RING(ring, 0x00000014);   // XXX
	OUT_RELOC(ring, query_sample(aq, stop));
	OUT_RING(ring, 0xffffffff);
	OUT_RING(ring, 0xffffffff);
	OUT_RING(ring, 0x00000010);   // XXX

	/* result += stop - start: */
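	/* (CP_MEM_TO_MEM with DOUBLE|NEG_C computes the 64-bit
	 * dst = srcA + srcB - srcC, hence the operand order below.) */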
	OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
	OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
			CP_MEM_TO_MEM_0_NEG_C);
	OUT_RELOC(ring, query_sample(aq, result)); /* dst */
	OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
	OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
	OUT_RELOC(ring, query_sample(aq, start));  /* srcC */

	fd5_context(batch->ctx)->samples_passed_queries--;
}

static void
occlusion_counter_result(struct fd_acc_query *aq, void *buf,
		union pipe_query_result *result)
{
	struct fd5_query_sample *sp = buf;
	result->u64 = sp->result;
}

static void
occlusion_predicate_result(struct fd_acc_query *aq, void *buf,
		union pipe_query_result *result)
{
	struct fd5_query_sample *sp = buf;
	result->b = !!sp->result;
}

static const struct fd_acc_sample_provider occlusion_counter = {
	.query_type = PIPE_QUERY_OCCLUSION_COUNTER,
	.size = sizeof(struct fd5_query_sample),
	.resume = occlusion_resume,
	.pause = occlusion_pause,
	.result = occlusion_counter_result,
};

static const struct fd_acc_sample_provider occlusion_predicate = {
	.query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
	.size = sizeof(struct fd5_query_sample),
	.resume = occlusion_resume,
	.pause = occlusion_pause,
	.result = occlusion_predicate_result,
};

static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
	.query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
	.size = sizeof(struct fd5_query_sample),
	.resume = occlusion_resume,
	.pause = occlusion_pause,
	.result = occlusion_predicate_result,
};

/*
 * Timestamp Queries:
 */

static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_ringbuffer *ring = batch->draw;

	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
	OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) |
			CP_EVENT_WRITE_0_TIMESTAMP);
	OUT_RELOC(ring, query_sample(aq, start));
	OUT_RING(ring, 0x00000000);

	fd_reset_wfi(batch);
}

static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_ringbuffer *ring = batch->draw;

	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
	OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) |
			CP_EVENT_WRITE_0_TIMESTAMP);
	OUT_RELOC(ring, query_sample(aq, stop));
	OUT_RING(ring, 0x00000000);

	fd_reset_wfi(batch);
	fd_wfi(batch, ring);

	/* result += stop - start: */
	OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
	OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
			CP_MEM_TO_MEM_0_NEG_C);
	OUT_RELOC(ring, query_sample(aq, result)); /* dst */
	OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
	OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
	OUT_RELOC(ring, query_sample(aq, start));  /* srcC */
}

static uint64_t
ticks_to_ns(uint64_t ts)
{
	/* This is based on the 19.2MHz always-on rbbm timer.
	 *
	 * TODO we should probably query this value from the kernel..
	 */
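	/* Note: 1000000000 / 19200000 evaluates to 52 in integer
	 * arithmetic, while the exact tick period is ~52.083ns, so
	 * the conversion carries a small (~0.16%) systematic error.
	 */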
	return ts * (1000000000 / 19200000);
}

static void
time_elapsed_accumulate_result(struct fd_acc_query *aq, void *buf,
		union pipe_query_result *result)
{
	struct fd5_query_sample *sp = buf;
	result->u64 = ticks_to_ns(sp->result);
}

static void
timestamp_accumulate_result(struct fd_acc_query *aq, void *buf,
		union pipe_query_result *result)
{
	struct fd5_query_sample *sp = buf;
	result->u64 = ticks_to_ns(sp->result);
}

static const struct fd_acc_sample_provider time_elapsed = {
	.query_type = PIPE_QUERY_TIME_ELAPSED,
	.always = true,
	.size = sizeof(struct fd5_query_sample),
	.resume = timestamp_resume,
	.pause = timestamp_pause,
	.result = time_elapsed_accumulate_result,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more non-sensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */

static const struct fd_acc_sample_provider timestamp = {
	.query_type = PIPE_QUERY_TIMESTAMP,
	.always = true,
	.size = sizeof(struct fd5_query_sample),
	.resume = timestamp_resume,
	.pause = timestamp_pause,
	.result = timestamp_accumulate_result,
};

/*
 * Performance Counter (batch) queries:
 *
 * Only one of these is active at a time, per the design of the gallium
 * batch_query API.  One perfcntr query tracks N query_types, each of
 * which has a 'fd_batch_query_entry' that maps it back to the
 * associated group and counter.
 */

struct fd_batch_query_entry {
	uint8_t gid;        /* group-id */
	uint8_t cid;        /* countable-id within the group */
};

struct fd_batch_query_data {
	struct fd_screen *screen;
	unsigned num_query_entries;
	struct fd_batch_query_entry query_entries[];
};
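
/* The query's sample buffer holds one fd5_query_sample slot per query
 * entry, laid out back to back (aq->size is set accordingly below in
 * fd5_create_batch_query()), which is why the snapshot/result code
 * addresses it with query_sample_idx(aq, i, field).
 */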

static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_batch_query_data *data = aq->query_data;
	struct fd_screen *screen = data->screen;
	struct fd_ringbuffer *ring = batch->draw;

	unsigned counters_per_group[screen->num_perfcntr_groups];
	memset(counters_per_group, 0, sizeof(counters_per_group));

	fd_wfi(batch, ring);

	/* configure performance counters for the requested queries: */
	for (unsigned i = 0; i < data->num_query_entries; i++) {
		struct fd_batch_query_entry *entry = &data->query_entries[i];
		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
		unsigned counter_idx = counters_per_group[entry->gid]++;

		debug_assert(counter_idx < g->num_counters);

		OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
		OUT_RING(ring, g->countables[entry->cid].selector);
	}

	memset(counters_per_group, 0, sizeof(counters_per_group));

	/* and snapshot the start values: */
	for (unsigned i = 0; i < data->num_query_entries; i++) {
		struct fd_batch_query_entry *entry = &data->query_entries[i];
		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
		unsigned counter_idx = counters_per_group[entry->gid]++;
		const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

		OUT_PKT7(ring, CP_REG_TO_MEM, 3);
		OUT_RING(ring, CP_REG_TO_MEM_0_64B |
				CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
		OUT_RELOC(ring, query_sample_idx(aq, i, start));
	}
}

static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
	struct fd_batch_query_data *data = aq->query_data;
	struct fd_screen *screen = data->screen;
	struct fd_ringbuffer *ring = batch->draw;

	unsigned counters_per_group[screen->num_perfcntr_groups];
	memset(counters_per_group, 0, sizeof(counters_per_group));

	fd_wfi(batch, ring);

	/* TODO do we need to bother to turn anything off? */

	/* snapshot the end values: */
	for (unsigned i = 0; i < data->num_query_entries; i++) {
		struct fd_batch_query_entry *entry = &data->query_entries[i];
		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
		unsigned counter_idx = counters_per_group[entry->gid]++;
		const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

		OUT_PKT7(ring, CP_REG_TO_MEM, 3);
		OUT_RING(ring, CP_REG_TO_MEM_0_64B |
				CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
		OUT_RELOC(ring, query_sample_idx(aq, i, stop));
	}

	/* and compute the result: */
	for (unsigned i = 0; i < data->num_query_entries; i++) {
		/* result += stop - start: */
		OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
		OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
				CP_MEM_TO_MEM_0_NEG_C);
		OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */
		OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */
		OUT_RELOC(ring, query_sample_idx(aq, i, stop));   /* srcB */
		OUT_RELOC(ring, query_sample_idx(aq, i, start));  /* srcC */
	}
}

static void
perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf,
		union pipe_query_result *result)
{
	struct fd_batch_query_data *data = aq->query_data;
	struct fd5_query_sample *sp = buf;

	for (unsigned i = 0; i < data->num_query_entries; i++) {
		result->batch[i].u64 = sp[i].result;
	}
}

static const struct fd_acc_sample_provider perfcntr = {
	.query_type = FD_QUERY_FIRST_PERFCNTR,
	.always = true,
	.resume = perfcntr_resume,
	.pause = perfcntr_pause,
	.result = perfcntr_accumulate_result,
};

static struct pipe_query *
fd5_create_batch_query(struct pipe_context *pctx,
		unsigned num_queries, unsigned *query_types)
{
	struct fd_context *ctx = fd_context(pctx);
	struct fd_screen *screen = ctx->screen;
	struct fd_query *q;
	struct fd_acc_query *aq;
	struct fd_batch_query_data *data;

	data = CALLOC_VARIANT_LENGTH_STRUCT(fd_batch_query_data,
			num_queries * sizeof(data->query_entries[0]));

	data->screen = screen;
	data->num_query_entries = num_queries;

	/* validate the requested query_types and ensure we don't try
	 * to request more query_types of a given group than we have
	 * counters:
	 */
	unsigned counters_per_group[screen->num_perfcntr_groups];
	memset(counters_per_group, 0, sizeof(counters_per_group));

	for (unsigned i = 0; i < num_queries; i++) {
		unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;

		/* verify valid query_type, ie. is it actually a perfcntr? */
		if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
				(idx >= screen->num_perfcntr_queries)) {
			debug_printf("invalid batch query query_type: %u\n", query_types[i]);
			goto error;
		}

		struct fd_batch_query_entry *entry = &data->query_entries[i];
		struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];

		entry->gid = pq->group_id;

		/* the perfcntr_queries[] table flattens all the countables
		 * for each group in series, ie:
		 *
		 *    (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
		 *
		 * So to find the countable index just step back through the
		 * table to find the first entry with the same group-id.
		 */
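		/* For example (illustrative numbers, not a real counter layout):
		 * if group 0 exposes three countables and group 1 exposes two,
		 * the flattened table is
		 *
		 *    (G0,C0) (G0,C1) (G0,C2) (G1,C0) (G1,C1)
		 *
		 * and for the entry at index 4 the loop below passes one earlier
		 * entry with the same group-id, so entry->cid ends up as 1.
		 */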
		while (pq > screen->perfcntr_queries) {
			pq--;
			if (pq->group_id == entry->gid)
				entry->cid++;
		}

		if (counters_per_group[entry->gid] >=
				screen->perfcntr_groups[entry->gid].num_counters) {
			debug_printf("too many counters for group %u\n", entry->gid);
			goto error;
		}

		counters_per_group[entry->gid]++;
	}

	q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
	aq = fd_acc_query(q);

	/* sample buffer size is based on # of queries: */
	aq->size = num_queries * sizeof(struct fd5_query_sample);
	aq->query_data = data;

	return (struct pipe_query *)q;

error:
	free(data);
	return NULL;
}
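
/* A minimal sketch of how a gallium frontend might exercise this path
 * (hypothetical caller, for illustration only):
 *
 *    unsigned qt[] = { FD_QUERY_FIRST_PERFCNTR + 0,
 *                      FD_QUERY_FIRST_PERFCNTR + 1 };
 *    struct pipe_query *q = pctx->create_batch_query(pctx, 2, qt);
 *    pctx->begin_query(pctx, q);
 *    ... emit draws ...
 *    pctx->end_query(pctx, q);
 *    union pipe_query_result res;
 *    pctx->get_query_result(pctx, q, true, &res);
 */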

void
fd5_query_context_init(struct pipe_context *pctx)
{
	struct fd_context *ctx = fd_context(pctx);

	ctx->create_query = fd_acc_create_query;
	ctx->query_set_stage = fd_acc_query_set_stage;

	pctx->create_batch_query = fd5_create_batch_query;

	fd_acc_query_register_provider(pctx, &occlusion_counter);
	fd_acc_query_register_provider(pctx, &occlusion_predicate);
	fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);

	fd_acc_query_register_provider(pctx, &time_elapsed);
	fd_acc_query_register_provider(pctx, &timestamp);
}
464