/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

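/* Emit callback for the shader-query atom. When the atom is emitted for a
 * draw, the GPU begins writing the record that gfx11_alloc_query_buffer
 * bound at the current head, so advance the head past that record.
 */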
static void emit_shader_query(struct si_context *sctx, unsigned index)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx11_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx11_sh_query_buffer_mem);
}

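/* Drop one reference from each buffer in the range [first, last] of the
 * context's buffer list. Buffers whose refcount reaches zero are freed,
 * except for the newest buffer (it may not be full yet) and the oldest
 * one (kept for recycling).
 */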
static void gfx11_release_query_buffers(struct si_context *sctx,
                                        struct gfx11_sh_query_buffer *first,
                                        struct gfx11_sh_query_buffer *last)
{
   while (first) {
      struct gfx11_sh_query_buffer *qbuf = first;
      if (first != last)
         first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

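/* Make sure a query buffer with at least one free record is bound as the
 * internal SI_GS_QUERY_BUF shader buffer: reuse the newest buffer if it
 * still has room, recycle the oldest buffer if the GPU is done with it,
 * and allocate a fresh one otherwise.
 */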
static bool gfx11_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx11_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx11_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx11_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx11_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

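   /* Each gfx11_sh_query_buffer_mem record occupies 32 uint64_t slots:
    * 16 primitive counters (4 streams x 4 counters, see si_query.h),
    * followed by the 32-bit fence dword and padding. Set the availability
    * bit (bit 63) on every counter and clear the fence.
    */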
   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx11_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

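   /* Bind the record at the current head as the GS query buffer and flag
    * streamout queries as enabled in the GS state.
    */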
success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx11_sh_query_buffer_mem);
   si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
   SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx11_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;
   gfx11_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

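/* Begin a query: release buffers still referenced by a previous begin/end
 * pair, make sure a query buffer is bound, and remember where this query's
 * results start.
 */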
static bool gfx11_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   gfx11_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx11_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

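/* End a query: remember where the results stop and emit a bottom-of-pipe
 * fence for the last completed record so that readers can wait on it.
 */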
static bool gfx11_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx11_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx11_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries <= 0) {
      si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
      SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

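/* Accumulate one record into the query result. The mask strips the
 * availability bit (bit 63) that each counter was initialized with for
 * SET_PREDICATION compatibility.
 */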
static void gfx11_sh_query_add_result(struct gfx11_sh_query *query,
                                      struct gfx11_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

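/* CPU readback path: walk the buffer chain backwards from last to first,
 * map each buffer, and accumulate every record in this query's range.
 */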
static bool gfx11_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx11_sh_query_buffer *qbuf = query->last;;
        qbuf = list_entry(qbuf->list.prev, struct gfx11_sh_query_buffer, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx11_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx11_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

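/* GPU result path: run the query-result compute shader once per buffer in
 * the chain, threading partial sums through a small scratch allocation and
 * writing the final value into the caller's resource.
 */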
static void gfx11_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               enum pipe_query_flags flags,
                                               enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx11_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
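   /* config selects the shader mode (0 = sum counters, 1 = report result
    * availability, 2 = SO overflow, 3 = SO overflow on any stream; bit 3
    * requests a 64-bit result), offset is the byte offset of the counter
    * within a record, chain tells the shader whether to read a previous
    * partial sum (bit 0) and/or write one for the next dispatch (bit 1),
    * and result_count is the number of records in the current buffer.
    */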
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = 4 * sizeof(uint64_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
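   /* ssbo[0] holds the records to sum, ssbo[1] the partial sum from the
    * previous dispatch, and ssbo[2] the destination: the scratch buffer
    * while chaining, or the caller's resource on the last dispatch.
    */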
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

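   /* Walk the buffer chain from first to last, launching the result shader
    * as a single-threaded 1x1x1 dispatch for each buffer.
    */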
   struct gfx11_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      /* Nothing to sum in this buffer. Advance (or stop) explicitly; a bare
       * "continue" would never move qbuf forward and would loop forever.
       */
      if (!end) {
         if (qbuf == query->last)
            break;
         qbuf = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
         continue;
      }

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);

      if (flags & PIPE_QUERY_WAIT) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx11_sh_query_buffer_mem);
         va += offsetof(struct gfx11_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      /* ssbo[2] is either tmp_buffer or resource */
      assert(ssbo[2].buffer);
      si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader,
                                    SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
                                    3, ssbo, (1 << 2) | (ssbo[1].buffer ? 1 << 1 : 0));

      if (qbuf == query->last)
         break;
      qbuf = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx11_sh_query_ops = {
   .destroy = gfx11_sh_query_destroy,
   .begin = gfx11_sh_query_begin,
   .end = gfx11_sh_query_end,
   .get_result = gfx11_sh_query_get_result,
   .get_result_resource = gfx11_sh_query_get_result_resource,
};

struct pipe_query *gfx11_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx11_sh_query *query = CALLOC_STRUCT(gfx11_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx11_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

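/* Hook up the shader-query atom; its emit callback advances the write head
 * past the record consumed by each draw.
 */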
void si_gfx11_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void si_gfx11_destroy_query(struct si_context *sctx)
{
   if (!sctx->shader_query_buffers.next)
      return;

   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx11_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}