/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
 * generated and (streamout-)emitted primitives.
 *
 * The context maintains a ring of these query buffers, and queries simply
 * point into the ring, allowing an arbitrary number of queries to be active
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
   struct list_head list;
   struct si_resource *buf;
   unsigned refcount;

   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
   unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
 * (including QBO shaders) and should be aligned to cachelines.
 *
 * The somewhat awkward memory layout is for compatibility with the
 * SET_PREDICATION packet, which also means that we're setting the high bit
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
   uint32_t pad[31];
};

/* Shader-based queries. */
struct gfx10_sh_query {
   struct si_query b;

   struct gfx10_sh_query_buffer *first;
   struct gfx10_sh_query_buffer *last;
   unsigned first_begin;
   unsigned last_end;

   unsigned stream;
};

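/* Emit callback for the shader query atom: commit the currently bound slot by
 * advancing the newest query buffer's write head past it. */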
static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

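/* Drop one reference from every buffer in the range [first, last] and free
 * buffers that are no longer referenced, except the newest one (it may still
 * receive results) and the oldest one (kept around for recycling). */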
static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

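/* Make sure a query buffer slot is bound as the GS query buffer: if the newest
 * ring buffer still has room, rebind at its head; otherwise recycle the oldest
 * idle buffer or allocate a new one, initialize it, and bind its first slot. */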
static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

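   /* Each slot is 32 qwords: qwords 0-15 are the per-stream counters, which
    * get the high bit preset (see the SET_PREDICATION note above); qword 16
    * holds the 32-bit fence, cleared here together with the first pad dword. */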
   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

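/* Begin the query: remember the first slot that will receive results and keep
 * the containing buffer alive for the lifetime of the query. */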
static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

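/* End the query: remember the slot past the last result, have the CP signal
 * the fence of the last written chunk, and unbind the query buffer once no
 * shader queries remain active. */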
static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx10_alloc_query_buffer(sctx);
   } else {
      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

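/* Accumulate the counters of one buffer slot into the result, masking off the
 * high bit that was preset for SET_PREDICATION compatibility. */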
static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

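/* CPU readback: map each buffer the query touched, walking from last to first,
 * and accumulate all slots that fall within the query's range. */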
static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

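/* Resolve the query into a buffer on the GPU: run the result compute shader
 * over each buffer the query touched, chaining partial results through a small
 * scratch allocation and writing the final value to the destination resource. */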
static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               bool wait, enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = sizeof(uint32_t) * (4 + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

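   /* ssbo[0] = source chunk of the query ring, ssbo[1] = scratch buffer that
    * carries partial results between chunks, ssbo[2] = destination (pointed at
    * the caller's resource for the last chunk below). */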
   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}