/*
 * Copyright (C) 2017 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

/* NOTE: see https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A5xx-Queries */

#include "freedreno_query_acc.h"
#include "freedreno_resource.h"

#include "fd5_context.h"
#include "fd5_emit.h"
#include "fd5_format.h"
#include "fd5_query.h"

struct PACKED fd5_query_sample {
   struct fd_acc_query_sample base;

   /* The RB_SAMPLE_COUNT_ADDR destination needs to be 16-byte aligned: */
   uint64_t pad;

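   /* 'start' and 'stop' hold the counter snapshots taken at resume/pause;
    * each pause accumulates (stop - start) into 'result' with a
    * CP_MEM_TO_MEM (see below):
    */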
   uint64_t start;
   uint64_t result;
   uint64_t stop;
};
DEFINE_CAST(fd_acc_query_sample, fd5_query_sample);

/* offset of a single field of an array of fd5_query_sample: */
#define query_sample_idx(aq, idx, field)                 \
   fd_resource((aq)->prsc)->bo,                          \
      (idx * sizeof(struct fd5_query_sample)) +          \
         offsetof(struct fd5_query_sample, field),       \
      0, 0

/* offset of a single field of fd5_query_sample: */
#define query_sample(aq, field) query_sample_idx(aq, 0, field)
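/* Note that these expand to the bo + offset (plus two trailing zeros for the
 * remaining reloc arguments) that OUT_RELOC() expects after the ring, so
 * they are used as e.g. OUT_RELOC(ring, query_sample(aq, start)).
 */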

/*
 * Occlusion Query:
 *
 * OCCLUSION_COUNTER and OCCLUSION_PREDICATE differ only in how they
 * interpret results
 */
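
/* Reading the counter works by pointing RB_SAMPLE_COUNT_ADDR at a slot in
 * the sample buffer, selecting RB_SAMPLE_COUNT_CONTROL_COPY, and firing a
 * ZPASS_DONE event, which has the hardware dump the current sample count
 * to that address.  Resume snapshots into 'start'; pause snapshots into
 * 'stop' and folds the difference into 'result'.
 */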

static void
occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   ASSERT_ALIGNED(struct fd5_query_sample, start, 16);

   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
   OUT_RELOC(ring, query_sample(aq, start));

   fd5_event_write(batch, ring, ZPASS_DONE, false);
   fd_reset_wfi(batch);

   fd5_context(batch->ctx)->samples_passed_queries++;
}

static void
occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch)
{
   struct fd_ringbuffer *ring = batch->draw;

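   /* Seed 'stop' with a sentinel; the CP_WAIT_REG_MEM below stalls the CP
    * until the sample-count copy triggered by ZPASS_DONE has overwritten it,
    * so the accumulation at the end reads back a completed value:
    */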
   OUT_PKT7(ring, CP_MEM_WRITE, 4);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0xffffffff);

   OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);

   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1);
   OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY);

   ASSERT_ALIGNED(struct fd5_query_sample, stop, 16);

   OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2);
   OUT_RELOC(ring, query_sample(aq, stop));

   fd5_event_write(batch, ring, ZPASS_DONE, false);
   fd_reset_wfi(batch);

   OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
   OUT_RING(ring, 0x00000014); // XXX
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0xffffffff);
   OUT_RING(ring, 0x00000010); // XXX

   /* result += stop - start: */
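   /* (CP_MEM_TO_MEM with DOUBLE operates on 64b values; NEG_C negates the
    * third source, so dst = srcA + srcB - srcC)
    */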
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */

   fd5_context(batch->ctx)->samples_passed_queries--;
}

static void
occlusion_counter_result(struct fd_acc_query *aq,
                         struct fd_acc_query_sample *s,
                         union pipe_query_result *result)
{
   struct fd5_query_sample *sp = fd5_query_sample(s);
   result->u64 = sp->result;
}

static void
occlusion_predicate_result(struct fd_acc_query *aq,
                           struct fd_acc_query_sample *s,
                           union pipe_query_result *result)
{
   struct fd5_query_sample *sp = fd5_query_sample(s);
   result->b = !!sp->result;
}

static const struct fd_acc_sample_provider occlusion_counter = {
   .query_type = PIPE_QUERY_OCCLUSION_COUNTER,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_counter_result,
};

static const struct fd_acc_sample_provider occlusion_predicate = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
};

static const struct fd_acc_sample_provider occlusion_predicate_conservative = {
   .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
   .size = sizeof(struct fd5_query_sample),
   .resume = occlusion_resume,
   .pause = occlusion_pause,
   .result = occlusion_predicate_result,
};

/*
 * Timestamp Queries:
 */
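
/* Both TIME_ELAPSED and TIMESTAMP use a timestamped CP_EVENT_WRITE of
 * RB_DONE_TS, which writes a GPU timestamp to the given address once the
 * preceding rendering has retired; the raw ticks are converted to
 * nanoseconds with ticks_to_ns() when the result is read back.
 */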

static void
timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, start));
   OUT_RING(ring, 0x00000000);

   fd_reset_wfi(batch);
}

static void
timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_ringbuffer *ring = batch->draw;

   OUT_PKT7(ring, CP_EVENT_WRITE, 4);
   OUT_RING(ring,
            CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
   OUT_RELOC(ring, query_sample(aq, stop));
   OUT_RING(ring, 0x00000000);

   fd_reset_wfi(batch);
   fd_wfi(batch, ring);

   /* result += stop - start: */
   OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
   OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   OUT_RELOC(ring, query_sample(aq, result)); /* dst */
   OUT_RELOC(ring, query_sample(aq, result)); /* srcA */
   OUT_RELOC(ring, query_sample(aq, stop));   /* srcB */
   OUT_RELOC(ring, query_sample(aq, start));  /* srcC */
}

static void
time_elapsed_accumulate_result(struct fd_acc_query *aq,
                               struct fd_acc_query_sample *s,
                               union pipe_query_result *result)
{
   struct fd5_query_sample *sp = fd5_query_sample(s);
   result->u64 = ticks_to_ns(sp->result);
}

static void
timestamp_accumulate_result(struct fd_acc_query *aq,
                            struct fd_acc_query_sample *s,
                            union pipe_query_result *result)
{
   struct fd5_query_sample *sp = fd5_query_sample(s);
   result->u64 = ticks_to_ns(sp->result);
}

static const struct fd_acc_sample_provider time_elapsed = {
   .query_type = PIPE_QUERY_TIME_ELAPSED,
   .always = true,
   .size = sizeof(struct fd5_query_sample),
   .resume = timestamp_resume,
   .pause = timestamp_pause,
   .result = time_elapsed_accumulate_result,
};

/* NOTE: timestamp query isn't going to give terribly sensible results
 * on a tiler.  But it is needed by qapitrace profile heatmap.  If you
 * add in a binning pass, the results get even more nonsensical.  So
 * we just return the timestamp on the first tile and hope that is
 * kind of good enough.
 */

static const struct fd_acc_sample_provider timestamp = {
   .query_type = PIPE_QUERY_TIMESTAMP,
   .always = true,
   .size = sizeof(struct fd5_query_sample),
   .resume = timestamp_resume,
   .pause = timestamp_pause,
   .result = timestamp_accumulate_result,
};

/*
 * Performance Counter (batch) queries:
 *
 * Only one of these is active at a time, by design of the gallium
 * batch_query API.  One perfcntr query tracks N query_types, each of
 * which has a 'fd_batch_query_entry' that maps it back to the
 * associated group and counter.
 */
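
/* perfcntr_resume() programs each requested countable into a physical
 * counter of its group and snapshots the counter values into 'start';
 * perfcntr_pause() snapshots them again into 'stop' and accumulates the
 * deltas into 'result', one fd5_query_sample slot per query entry.
 */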

struct fd_batch_query_entry {
   uint8_t gid; /* group-id */
   uint8_t cid; /* countable-id within the group */
};

struct fd_batch_query_data {
   struct fd_screen *screen;
   unsigned num_query_entries;
   struct fd_batch_query_entry query_entries[];
};

static void
perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   fd_wfi(batch, ring);

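   /* Counters within a group are allocated in order: the Nth query entry
    * that references group G gets G's Nth physical counter.  The same
    * counters_per_group[] walk is repeated below (and in perfcntr_pause())
    * so every pass maps a given entry to the same counter register.
    */
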
   /* configure performance counters for the requested queries: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;

      assert(counter_idx < g->num_counters);

      OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
      OUT_RING(ring, g->countables[entry->cid].selector);
   }

   memset(counters_per_group, 0, sizeof(counters_per_group));

   /* and snapshot the start values */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                        CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, start));
   }
}

static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
   struct fd_batch_query_data *data = aq->query_data;
   struct fd_screen *screen = data->screen;
   struct fd_ringbuffer *ring = batch->draw;

   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   fd_wfi(batch, ring);

   /* TODO do we need to bother to turn anything off? */

   /* snapshot the end values: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
      unsigned counter_idx = counters_per_group[entry->gid]++;
      const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];

      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
      OUT_RING(ring, CP_REG_TO_MEM_0_64B |
                        CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));
   }

   /* and compute the result: */
   for (unsigned i = 0; i < data->num_query_entries; i++) {
      /* result += stop - start: */
      OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
      OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */
      OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */
      OUT_RELOC(ring, query_sample_idx(aq, i, stop));   /* srcB */
      OUT_RELOC(ring, query_sample_idx(aq, i, start));  /* srcC */
   }
}

static void
perfcntr_accumulate_result(struct fd_acc_query *aq,
                           struct fd_acc_query_sample *s,
                           union pipe_query_result *result)
{
   struct fd_batch_query_data *data = aq->query_data;
   struct fd5_query_sample *sp = fd5_query_sample(s);

   for (unsigned i = 0; i < data->num_query_entries; i++) {
      result->batch[i].u64 = sp[i].result;
   }
}

static const struct fd_acc_sample_provider perfcntr = {
   .query_type = FD_QUERY_FIRST_PERFCNTR,
   .always = true,
   .resume = perfcntr_resume,
   .pause = perfcntr_pause,
   .result = perfcntr_accumulate_result,
};
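
/* Note that the perfcntr provider does not set .size here; the sample
 * buffer size depends on how many query_types the batch query tracks, so
 * it is filled in per-query in fd5_create_batch_query() below.
 */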

static struct pipe_query *
fd5_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
                       unsigned *query_types)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct fd_query *q;
   struct fd_acc_query *aq;
   struct fd_batch_query_data *data;

   data = CALLOC_VARIANT_LENGTH_STRUCT(
      fd_batch_query_data, num_queries * sizeof(data->query_entries[0]));

   data->screen = screen;
   data->num_query_entries = num_queries;

   /* validate the requested query_types and ensure we don't try
    * to request more query_types of a given group than we have
    * counters:
    */
   unsigned counters_per_group[screen->num_perfcntr_groups];
   memset(counters_per_group, 0, sizeof(counters_per_group));

   for (unsigned i = 0; i < num_queries; i++) {
      unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;

      /* verify valid query_type, ie. is it actually a perfcntr? */
      if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
          (idx >= screen->num_perfcntr_queries)) {
         mesa_loge("invalid batch query query_type: %u", query_types[i]);
         goto error;
      }

      struct fd_batch_query_entry *entry = &data->query_entries[i];
      struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];

      entry->gid = pq->group_id;

      /* the perfcntr_queries[] table flattens all the countables
       * for each group in series, ie:
       *
       *   (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
       *
       * So to find the countable index just step back through the
       * table to find the first entry with the same group-id.
       */
      while (pq > screen->perfcntr_queries) {
         pq--;
         if (pq->group_id == entry->gid)
            entry->cid++;
      }

      if (counters_per_group[entry->gid] >=
          screen->perfcntr_groups[entry->gid].num_counters) {
         mesa_loge("too many counters for group %u\n", entry->gid);
         goto error;
      }

      counters_per_group[entry->gid]++;
   }

   q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
   aq = fd_acc_query(q);

   /* sample buffer size is based on # of queries: */
   aq->size = num_queries * sizeof(struct fd5_query_sample);
   aq->query_data = data;

   return (struct pipe_query *)q;

error:
   free(data);
   return NULL;
}

void
fd5_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   struct fd_context *ctx = fd_context(pctx);

   ctx->create_query = fd_acc_create_query;
   ctx->query_update_batch = fd_acc_query_update_batch;

   pctx->create_batch_query = fd5_create_batch_query;

   fd_acc_query_register_provider(pctx, &occlusion_counter);
   fd_acc_query_register_provider(pctx, &occlusion_predicate);
   fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);

   fd_acc_query_register_provider(pctx, &time_elapsed);
   fd_acc_query_register_provider(pctx, &timestamp);
}