/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

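/* Emit callback of the shader-query atom: advance the write head of the most
 * recent query buffer by one result slot, so the gfx10_sh_query_buffer_mem slot
 * bound by gfx10_alloc_query_buffer is considered consumed. */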
static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

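/* Drop one reference from every buffer in the chain from 'first' to 'last' and
 * free buffers whose refcount reaches zero, except the newest buffer (it may not
 * be full yet) and the oldest one (kept around for recycling). */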
static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

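/* Make sure a result slot is bound for the shader to write into: reuse the
 * current buffer while it still has room, recycle the oldest buffer if the GPU
 * is done with it, or allocate a fresh one; then bind the slot as an internal
 * shader buffer and mark the shader-query atom dirty. */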
static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

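   /* Each gfx10_sh_query_buffer_mem slot is assumed to span 32 qwords: 16
    * counter qwords (4 streams x 4 values) followed by the fence dword and
    * padding. Prime the counters with the high bit set and clear the fence. */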
   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_internal_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

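/* Begin the query: make sure a buffer slot is bound, remember the current buffer
 * and head offset as the start of this query's results, and take a reference on
 * the starting buffer. */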
static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

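/* End the query: record where its results stop and, if any slot was consumed,
 * emit a bottom-of-pipe release_mem that writes the fence of the last used slot
 * so readers can tell when its counters are final. */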
static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries <= 0 || !si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) {
      si_set_internal_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

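/* Accumulate one result slot into the CPU-visible result. The high bit of each
 * counter is reserved for SET_PREDICATION compatibility and is masked off here. */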
static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

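/* CPU readback: walk the buffer chain from last to first, map each buffer and
 * sum every slot between this query's recorded begin and end offsets. */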
static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

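/* GPU-side resolve: run the query-result compute shader over every buffer in the
 * chain, carrying partial sums through a small zeroed scratch allocation and
 * writing the final value at 'offset' into 'resource'. */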
static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               bool wait, enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
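   /* As used below: 'config' selects the operation (0 = accumulate a counter,
    * 1 = availability only, 2 = overflow test for one stream, 3 = overflow test
    * across all streams; bit 3 requests a 64-bit result), 'offset' is the byte
    * offset of the counter within a result slot, 'chain' flags whether this
    * dispatch continues from / feeds into another buffer, and 'result_count' is
    * the number of slots to accumulate. */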
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = 4 * sizeof(uint64_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

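   /* ssbo[0] is the source query buffer, ssbo[1] is the scratch buffer used to
    * chain partial results between dispatches, and ssbo[2] is the destination
    * (it aliases the scratch until the last buffer in the chain is processed). */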
   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

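   /* One 1x1x1 dispatch is launched per buffer in the chain; the result shader
    * is expected to iterate over consts.result_count slots itself. */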
   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader,
                                    SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
                                    3, ssbo, 0x6);

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

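/* Create a query object backed by the shader-query path; 'index' selects the
 * vertex stream for per-stream query types. */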
struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

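/* Per-context initialization: set up the query buffer list and hook the
 * shader-query atom's emit callback. */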
void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}