/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

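/* Atom emit callback for the shader-query buffer: the record at qbuf->head was
 * bound as SI_GS_QUERY_BUF when the atom was marked dirty, so advance the write
 * head past it to reserve that gfx11_sh_query_buffer_mem slot.
 */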
static void emit_shader_query(struct si_context *sctx, unsigned index)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx11_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx11_sh_query_buffer_mem);
}

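/* Drop one reference on each buffer in the chain [first, last]. Buffers whose
 * refcount reaches zero are freed, except for the newest buffer (it may still
 * be partially filled) and the oldest one, which is kept for recycling by
 * gfx11_alloc_query_buffer.
 */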
static void gfx11_release_query_buffers(struct si_context *sctx,
                                        struct gfx11_sh_query_buffer *first,
                                        struct gfx11_sh_query_buffer *last)
{
   while (first) {
      struct gfx11_sh_query_buffer *qbuf = first;
      if (first != last)
         first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

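/* Make sure a query buffer slot is bound for the next draw. If the newest
 * buffer still has room, reuse it; otherwise recycle the idle oldest buffer or
 * allocate a fresh one, initialize its records, and bind the slot at its head
 * as SI_GS_QUERY_BUF. Returns false on allocation failure.
 */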
static bool gfx11_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx11_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx11_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0,
                                RADEON_USAGE_READWRITE | RADEON_USAGE_DISALLOW_SLOW_REPLY)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx11_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx11_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

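   /* Each record is written as 32 qwords: qwords 0..15 are the primitive
    * counters (initialized with bit 63 set, as required above for
    * SET_PREDICATION), and qword 16, which holds the 32-bit fence, is cleared.
    */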
   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx11_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx11_sh_query_buffer_mem);
   si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
   SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx11_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;
   gfx11_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

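/* Begin a query: remember the buffer and offset where its results start and
 * pin that buffer with a reference. Draws issued from now on accumulate into
 * the records that follow.
 */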
static bool gfx11_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   gfx11_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx11_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

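/* End a query: record where its results stop and emit a bottom-of-pipe
 * release_mem that writes the fence of the record preceding the end offset, so
 * result readers can wait on it. Once no queries remain active, unbind the
 * query buffer and clear the atom.
 */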
static bool gfx11_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx11_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx11_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries <= 0) {
      si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
      SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

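/* Accumulate one record into the query result. The counters were initialized
 * with bit 63 set (for SET_PREDICATION), so that bit is masked off before
 * adding.
 */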
static void gfx11_sh_query_add_result(struct gfx11_sh_query *query,
                                      struct gfx11_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

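/* CPU result path: map each buffer in the chain (newest to oldest) and add up
 * all records in the query's [first_begin, last_end) range. Fails if a
 * non-blocking map is requested and the buffer is still busy.
 */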
static bool gfx11_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx11_sh_query_buffer *qbuf = query->last;;
        qbuf = list_entry(qbuf->list.prev, struct gfx11_sh_query_buffer, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx11_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx11_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

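/* GPU result path: run the query-result compute shader once per buffer in the
 * chain. Intermediate sums are carried in a small zeroed scratch allocation
 * (tmp_buffer); the final pass writes the value (or availability) into the
 * destination resource at the requested offset.
 */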
static void gfx11_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               enum pipe_query_flags flags,
                                               enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx11_sh_query *query = (struct gfx11_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx11_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
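   /* As filled in below: config selects the result shader's operation (0 = sum
    * a counter, 1 = availability only, 2/3 = overflow test for one/all
    * streams; bit 3 = 64-bit destination), offset is the byte offset of the
    * counter within a record, result_count is the number of records processed
    * in one pass, and chain marks whether a previous/next buffer exists.
    */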
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + (index ? 2 : 3) * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = 4 * sizeof(uint64_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   bool is_result_64bit = result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64;
   if (is_result_64bit)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   /* TODO: Range-invalidate GL2 */
   if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) {
      sctx->barrier_flags |= SI_BARRIER_INV_L2;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
   }

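   /* Walk the buffer chain from first to last. Each pass reads the records in
    * [begin, end) of one buffer (ssbo[0]) and accumulates through the scratch
    * buffer (ssbo[1]/ssbo[2]); on the last buffer, ssbo[2] points at the
    * destination resource instead.
    */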
   struct gfx11_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = is_result_64bit ? 8 : 4;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);

      if (flags & PIPE_QUERY_WAIT) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx11_sh_query_buffer_mem);
         va += offsetof(struct gfx11_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      /* ssbo[2] is either tmp_buffer or resource */
      assert(ssbo[2].buffer);

      unsigned writable_bitmask = (1 << 2) | (ssbo[1].buffer ? 1 << 1 : 0);

      si_barrier_before_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);
      si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader, 3, ssbo,
                                    writable_bitmask, false);
      si_barrier_after_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);

      if (qbuf == query->last)
         break;
      qbuf = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx11_sh_query_ops = {
   .destroy = gfx11_sh_query_destroy,
   .begin = gfx11_sh_query_begin,
   .end = gfx11_sh_query_end,
   .get_result = gfx11_sh_query_get_result,
   .get_result_resource = gfx11_sh_query_get_result_resource,
};

struct pipe_query *gfx11_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx11_sh_query *query = CALLOC_STRUCT(gfx11_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx11_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void si_gfx11_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

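/* Context teardown: free whatever query buffers are still on the list. The
 * early return handles contexts where si_gfx11_init_query never ran and the
 * list head is still zeroed.
 */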
void si_gfx11_destroy_query(struct si_context *sctx)
{
   if (!sctx->shader_query_buffers.next)
      return;

   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx11_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}