1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4 * Copyright 2018 Advanced Micro Devices, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * on the rights to use, copy, modify, merge, publish, distribute, sub
11 * license, and/or sell copies of the Software, and to permit persons to whom
12 * the Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #include "si_query.h"
28 #include "si_build_pm4.h"
29
30 #include "amd/common/sid.h"
31 #include "si_pipe.h"
32 #include "util/os_time.h"
33 #include "util/u_memory.h"
34 #include "util/u_suballoc.h"
35 #include "util/u_upload_mgr.h"
36
37 static const struct si_query_ops query_hw_ops;
38
39 struct si_hw_query_params {
40 unsigned start_offset;
41 unsigned end_offset;
42 unsigned fence_offset;
43 unsigned pair_stride;
44 unsigned pair_count;
45 };
46
47 /* Queries without buffer handling or suspend/resume. */
48 struct si_query_sw {
49 struct si_query b;
50
51 uint64_t begin_result;
52 uint64_t end_result;
53
54 uint64_t begin_time;
55 uint64_t end_time;
56
57 /* Fence for GPU_FINISHED. */
58 struct pipe_fence_handle *fence;
59 };
60
si_query_sw_destroy(struct si_context * sctx,struct si_query * squery)61 static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
62 {
63 struct si_query_sw *query = (struct si_query_sw *)squery;
64
65 sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
66 FREE(query);
67 }
68
winsys_id_from_type(unsigned type)69 static enum radeon_value_id winsys_id_from_type(unsigned type)
70 {
71 switch (type) {
72 case SI_QUERY_REQUESTED_VRAM:
73 return RADEON_REQUESTED_VRAM_MEMORY;
74 case SI_QUERY_REQUESTED_GTT:
75 return RADEON_REQUESTED_GTT_MEMORY;
76 case SI_QUERY_MAPPED_VRAM:
77 return RADEON_MAPPED_VRAM;
78 case SI_QUERY_MAPPED_GTT:
79 return RADEON_MAPPED_GTT;
80 case SI_QUERY_SLAB_WASTED_VRAM:
81 return RADEON_SLAB_WASTED_VRAM;
82 case SI_QUERY_SLAB_WASTED_GTT:
83 return RADEON_SLAB_WASTED_GTT;
84 case SI_QUERY_BUFFER_WAIT_TIME:
85 return RADEON_BUFFER_WAIT_TIME_NS;
86 case SI_QUERY_NUM_MAPPED_BUFFERS:
87 return RADEON_NUM_MAPPED_BUFFERS;
88 case SI_QUERY_NUM_GFX_IBS:
89 return RADEON_NUM_GFX_IBS;
90 case SI_QUERY_GFX_BO_LIST_SIZE:
91 return RADEON_GFX_BO_LIST_COUNTER;
92 case SI_QUERY_GFX_IB_SIZE:
93 return RADEON_GFX_IB_SIZE_COUNTER;
94 case SI_QUERY_NUM_BYTES_MOVED:
95 return RADEON_NUM_BYTES_MOVED;
96 case SI_QUERY_NUM_EVICTIONS:
97 return RADEON_NUM_EVICTIONS;
98 case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
99 return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
100 case SI_QUERY_VRAM_USAGE:
101 return RADEON_VRAM_USAGE;
102 case SI_QUERY_VRAM_VIS_USAGE:
103 return RADEON_VRAM_VIS_USAGE;
104 case SI_QUERY_GTT_USAGE:
105 return RADEON_GTT_USAGE;
106 case SI_QUERY_GPU_TEMPERATURE:
107 return RADEON_GPU_TEMPERATURE;
108 case SI_QUERY_CURRENT_GPU_SCLK:
109 return RADEON_CURRENT_SCLK;
110 case SI_QUERY_CURRENT_GPU_MCLK:
111 return RADEON_CURRENT_MCLK;
112 case SI_QUERY_CS_THREAD_BUSY:
113 return RADEON_CS_THREAD_TIME;
114 default:
115 unreachable("query type does not correspond to winsys id");
116 }
117 }
118
si_query_sw_begin(struct si_context * sctx,struct si_query * squery)119 static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
120 {
121 struct si_query_sw *query = (struct si_query_sw *)squery;
122 enum radeon_value_id ws_id;
123
124 switch (query->b.type) {
125 case PIPE_QUERY_TIMESTAMP_DISJOINT:
126 case PIPE_QUERY_GPU_FINISHED:
127 break;
128 case SI_QUERY_DRAW_CALLS:
129 query->begin_result = sctx->num_draw_calls;
130 break;
131 case SI_QUERY_DECOMPRESS_CALLS:
132 query->begin_result = sctx->num_decompress_calls;
133 break;
134 case SI_QUERY_PRIM_RESTART_CALLS:
135 query->begin_result = sctx->num_prim_restart_calls;
136 break;
137 case SI_QUERY_COMPUTE_CALLS:
138 query->begin_result = sctx->num_compute_calls;
139 break;
140 case SI_QUERY_CP_DMA_CALLS:
141 query->begin_result = sctx->num_cp_dma_calls;
142 break;
143 case SI_QUERY_NUM_VS_FLUSHES:
144 query->begin_result = sctx->num_vs_flushes;
145 break;
146 case SI_QUERY_NUM_PS_FLUSHES:
147 query->begin_result = sctx->num_ps_flushes;
148 break;
149 case SI_QUERY_NUM_CS_FLUSHES:
150 query->begin_result = sctx->num_cs_flushes;
151 break;
152 case SI_QUERY_NUM_CB_CACHE_FLUSHES:
153 query->begin_result = sctx->num_cb_cache_flushes;
154 break;
155 case SI_QUERY_NUM_DB_CACHE_FLUSHES:
156 query->begin_result = sctx->num_db_cache_flushes;
157 break;
158 case SI_QUERY_NUM_L2_INVALIDATES:
159 query->begin_result = sctx->num_L2_invalidates;
160 break;
161 case SI_QUERY_NUM_L2_WRITEBACKS:
162 query->begin_result = sctx->num_L2_writebacks;
163 break;
164 case SI_QUERY_NUM_RESIDENT_HANDLES:
165 query->begin_result = sctx->num_resident_handles;
166 break;
167 case SI_QUERY_TC_OFFLOADED_SLOTS:
168 query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
169 break;
170 case SI_QUERY_TC_DIRECT_SLOTS:
171 query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
172 break;
173 case SI_QUERY_TC_NUM_SYNCS:
174 query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
175 break;
176 case SI_QUERY_REQUESTED_VRAM:
177 case SI_QUERY_REQUESTED_GTT:
178 case SI_QUERY_MAPPED_VRAM:
179 case SI_QUERY_MAPPED_GTT:
180 case SI_QUERY_SLAB_WASTED_VRAM:
181 case SI_QUERY_SLAB_WASTED_GTT:
182 case SI_QUERY_VRAM_USAGE:
183 case SI_QUERY_VRAM_VIS_USAGE:
184 case SI_QUERY_GTT_USAGE:
185 case SI_QUERY_GPU_TEMPERATURE:
186 case SI_QUERY_CURRENT_GPU_SCLK:
187 case SI_QUERY_CURRENT_GPU_MCLK:
188 case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
189 case SI_QUERY_NUM_MAPPED_BUFFERS:
190 query->begin_result = 0;
191 break;
192 case SI_QUERY_BUFFER_WAIT_TIME:
193 case SI_QUERY_GFX_IB_SIZE:
194 case SI_QUERY_NUM_GFX_IBS:
195 case SI_QUERY_NUM_BYTES_MOVED:
196 case SI_QUERY_NUM_EVICTIONS:
197 case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
198 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
199 query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
200 break;
201 }
202 case SI_QUERY_GFX_BO_LIST_SIZE:
203 ws_id = winsys_id_from_type(query->b.type);
204 query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
205 query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
206 break;
207 case SI_QUERY_CS_THREAD_BUSY:
208 ws_id = winsys_id_from_type(query->b.type);
209 query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
210 query->begin_time = os_time_get_nano();
211 break;
212 case SI_QUERY_GALLIUM_THREAD_BUSY:
213 query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
214 query->begin_time = os_time_get_nano();
215 break;
216 case SI_QUERY_GPU_LOAD:
217 case SI_QUERY_GPU_SHADERS_BUSY:
218 case SI_QUERY_GPU_TA_BUSY:
219 case SI_QUERY_GPU_GDS_BUSY:
220 case SI_QUERY_GPU_VGT_BUSY:
221 case SI_QUERY_GPU_IA_BUSY:
222 case SI_QUERY_GPU_SX_BUSY:
223 case SI_QUERY_GPU_WD_BUSY:
224 case SI_QUERY_GPU_BCI_BUSY:
225 case SI_QUERY_GPU_SC_BUSY:
226 case SI_QUERY_GPU_PA_BUSY:
227 case SI_QUERY_GPU_DB_BUSY:
228 case SI_QUERY_GPU_CP_BUSY:
229 case SI_QUERY_GPU_CB_BUSY:
230 case SI_QUERY_GPU_SDMA_BUSY:
231 case SI_QUERY_GPU_PFP_BUSY:
232 case SI_QUERY_GPU_MEQ_BUSY:
233 case SI_QUERY_GPU_ME_BUSY:
234 case SI_QUERY_GPU_SURF_SYNC_BUSY:
235 case SI_QUERY_GPU_CP_DMA_BUSY:
236 case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
237 query->begin_result = si_begin_counter(sctx->screen, query->b.type);
238 break;
239 case SI_QUERY_NUM_COMPILATIONS:
240 query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
241 break;
242 case SI_QUERY_NUM_SHADERS_CREATED:
243 query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
244 break;
245 case SI_QUERY_LIVE_SHADER_CACHE_HITS:
246 query->begin_result = sctx->screen->live_shader_cache.hits;
247 break;
248 case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
249 query->begin_result = sctx->screen->live_shader_cache.misses;
250 break;
251 case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
252 query->begin_result = sctx->screen->num_memory_shader_cache_hits;
253 break;
254 case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
255 query->begin_result = sctx->screen->num_memory_shader_cache_misses;
256 break;
257 case SI_QUERY_DISK_SHADER_CACHE_HITS:
258 query->begin_result = sctx->screen->num_disk_shader_cache_hits;
259 break;
260 case SI_QUERY_DISK_SHADER_CACHE_MISSES:
261 query->begin_result = sctx->screen->num_disk_shader_cache_misses;
262 break;
263 case SI_QUERY_GPIN_ASIC_ID:
264 case SI_QUERY_GPIN_NUM_SIMD:
265 case SI_QUERY_GPIN_NUM_RB:
266 case SI_QUERY_GPIN_NUM_SPI:
267 case SI_QUERY_GPIN_NUM_SE:
268 break;
269 default:
270 unreachable("si_query_sw_begin: bad query type");
271 }
272
273 return true;
274 }
275
si_query_sw_end(struct si_context * sctx,struct si_query * squery)276 static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
277 {
278 struct si_query_sw *query = (struct si_query_sw *)squery;
279 enum radeon_value_id ws_id;
280
281 switch (query->b.type) {
282 case PIPE_QUERY_TIMESTAMP_DISJOINT:
283 break;
284 case PIPE_QUERY_GPU_FINISHED:
285 sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
286 break;
287 case SI_QUERY_DRAW_CALLS:
288 query->end_result = sctx->num_draw_calls;
289 break;
290 case SI_QUERY_DECOMPRESS_CALLS:
291 query->end_result = sctx->num_decompress_calls;
292 break;
293 case SI_QUERY_PRIM_RESTART_CALLS:
294 query->end_result = sctx->num_prim_restart_calls;
295 break;
296 case SI_QUERY_COMPUTE_CALLS:
297 query->end_result = sctx->num_compute_calls;
298 break;
299 case SI_QUERY_CP_DMA_CALLS:
300 query->end_result = sctx->num_cp_dma_calls;
301 break;
302 case SI_QUERY_NUM_VS_FLUSHES:
303 query->end_result = sctx->num_vs_flushes;
304 break;
305 case SI_QUERY_NUM_PS_FLUSHES:
306 query->end_result = sctx->num_ps_flushes;
307 break;
308 case SI_QUERY_NUM_CS_FLUSHES:
309 query->end_result = sctx->num_cs_flushes;
310 break;
311 case SI_QUERY_NUM_CB_CACHE_FLUSHES:
312 query->end_result = sctx->num_cb_cache_flushes;
313 break;
314 case SI_QUERY_NUM_DB_CACHE_FLUSHES:
315 query->end_result = sctx->num_db_cache_flushes;
316 break;
317 case SI_QUERY_NUM_L2_INVALIDATES:
318 query->end_result = sctx->num_L2_invalidates;
319 break;
320 case SI_QUERY_NUM_L2_WRITEBACKS:
321 query->end_result = sctx->num_L2_writebacks;
322 break;
323 case SI_QUERY_NUM_RESIDENT_HANDLES:
324 query->end_result = sctx->num_resident_handles;
325 break;
326 case SI_QUERY_TC_OFFLOADED_SLOTS:
327 query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
328 break;
329 case SI_QUERY_TC_DIRECT_SLOTS:
330 query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
331 break;
332 case SI_QUERY_TC_NUM_SYNCS:
333 query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
334 break;
335 case SI_QUERY_REQUESTED_VRAM:
336 case SI_QUERY_REQUESTED_GTT:
337 case SI_QUERY_MAPPED_VRAM:
338 case SI_QUERY_MAPPED_GTT:
339 case SI_QUERY_SLAB_WASTED_VRAM:
340 case SI_QUERY_SLAB_WASTED_GTT:
341 case SI_QUERY_VRAM_USAGE:
342 case SI_QUERY_VRAM_VIS_USAGE:
343 case SI_QUERY_GTT_USAGE:
344 case SI_QUERY_GPU_TEMPERATURE:
345 case SI_QUERY_CURRENT_GPU_SCLK:
346 case SI_QUERY_CURRENT_GPU_MCLK:
347 case SI_QUERY_BUFFER_WAIT_TIME:
348 case SI_QUERY_GFX_IB_SIZE:
349 case SI_QUERY_NUM_MAPPED_BUFFERS:
350 case SI_QUERY_NUM_GFX_IBS:
351 case SI_QUERY_NUM_BYTES_MOVED:
352 case SI_QUERY_NUM_EVICTIONS:
353 case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
354 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
355 query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
356 break;
357 }
358 case SI_QUERY_GFX_BO_LIST_SIZE:
359 ws_id = winsys_id_from_type(query->b.type);
360 query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
361 query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
362 break;
363 case SI_QUERY_CS_THREAD_BUSY:
364 ws_id = winsys_id_from_type(query->b.type);
365 query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
366 query->end_time = os_time_get_nano();
367 break;
368 case SI_QUERY_GALLIUM_THREAD_BUSY:
369 query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
370 query->end_time = os_time_get_nano();
371 break;
372 case SI_QUERY_GPU_LOAD:
373 case SI_QUERY_GPU_SHADERS_BUSY:
374 case SI_QUERY_GPU_TA_BUSY:
375 case SI_QUERY_GPU_GDS_BUSY:
376 case SI_QUERY_GPU_VGT_BUSY:
377 case SI_QUERY_GPU_IA_BUSY:
378 case SI_QUERY_GPU_SX_BUSY:
379 case SI_QUERY_GPU_WD_BUSY:
380 case SI_QUERY_GPU_BCI_BUSY:
381 case SI_QUERY_GPU_SC_BUSY:
382 case SI_QUERY_GPU_PA_BUSY:
383 case SI_QUERY_GPU_DB_BUSY:
384 case SI_QUERY_GPU_CP_BUSY:
385 case SI_QUERY_GPU_CB_BUSY:
386 case SI_QUERY_GPU_SDMA_BUSY:
387 case SI_QUERY_GPU_PFP_BUSY:
388 case SI_QUERY_GPU_MEQ_BUSY:
389 case SI_QUERY_GPU_ME_BUSY:
390 case SI_QUERY_GPU_SURF_SYNC_BUSY:
391 case SI_QUERY_GPU_CP_DMA_BUSY:
392 case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
393 query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
394 query->begin_result = 0;
395 break;
396 case SI_QUERY_NUM_COMPILATIONS:
397 query->end_result = p_atomic_read(&sctx->screen->num_compilations);
398 break;
399 case SI_QUERY_NUM_SHADERS_CREATED:
400 query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
401 break;
402 case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
403 query->end_result = sctx->last_tex_ps_draw_ratio;
404 break;
405 case SI_QUERY_LIVE_SHADER_CACHE_HITS:
406 query->end_result = sctx->screen->live_shader_cache.hits;
407 break;
408 case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
409 query->end_result = sctx->screen->live_shader_cache.misses;
410 break;
411 case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
412 query->end_result = sctx->screen->num_memory_shader_cache_hits;
413 break;
414 case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
415 query->end_result = sctx->screen->num_memory_shader_cache_misses;
416 break;
417 case SI_QUERY_DISK_SHADER_CACHE_HITS:
418 query->end_result = sctx->screen->num_disk_shader_cache_hits;
419 break;
420 case SI_QUERY_DISK_SHADER_CACHE_MISSES:
421 query->end_result = sctx->screen->num_disk_shader_cache_misses;
422 break;
423 case SI_QUERY_GPIN_ASIC_ID:
424 case SI_QUERY_GPIN_NUM_SIMD:
425 case SI_QUERY_GPIN_NUM_RB:
426 case SI_QUERY_GPIN_NUM_SPI:
427 case SI_QUERY_GPIN_NUM_SE:
428 break;
429 default:
430 unreachable("si_query_sw_end: bad query type");
431 }
432
433 return true;
434 }
435
si_query_sw_get_result(struct si_context * sctx,struct si_query * squery,bool wait,union pipe_query_result * result)436 static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
437 union pipe_query_result *result)
438 {
439 struct si_query_sw *query = (struct si_query_sw *)squery;
440
441 switch (query->b.type) {
442 case PIPE_QUERY_TIMESTAMP_DISJOINT:
443 /* Convert from cycles per millisecond to cycles per second (Hz). */
444 result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
445 result->timestamp_disjoint.disjoint = false;
446 return true;
447 case PIPE_QUERY_GPU_FINISHED: {
448 struct pipe_screen *screen = sctx->b.screen;
449 struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
450
451 result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
452 return result->b;
453 }
454
455 case SI_QUERY_GFX_BO_LIST_SIZE:
456 result->u64 =
457 (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
458 return true;
459 case SI_QUERY_CS_THREAD_BUSY:
460 case SI_QUERY_GALLIUM_THREAD_BUSY:
461 result->u64 =
462 (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
463 return true;
464 case SI_QUERY_GPIN_ASIC_ID:
465 result->u32 = 0;
466 return true;
467 case SI_QUERY_GPIN_NUM_SIMD:
468 result->u32 = sctx->screen->info.num_cu;
469 return true;
470 case SI_QUERY_GPIN_NUM_RB:
471 result->u32 = sctx->screen->info.max_render_backends;
472 return true;
473 case SI_QUERY_GPIN_NUM_SPI:
474 result->u32 = 1; /* all supported chips have one SPI per SE */
475 return true;
476 case SI_QUERY_GPIN_NUM_SE:
477 result->u32 = sctx->screen->info.max_se;
478 return true;
479 }
480
481 result->u64 = query->end_result - query->begin_result;
482
483 switch (query->b.type) {
484 case SI_QUERY_BUFFER_WAIT_TIME:
485 case SI_QUERY_GPU_TEMPERATURE:
486 result->u64 /= 1000;
487 break;
488 case SI_QUERY_CURRENT_GPU_SCLK:
489 case SI_QUERY_CURRENT_GPU_MCLK:
490 result->u64 *= 1000000;
491 break;
492 }
493
494 return true;
495 }
496
497 static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
498 .begin = si_query_sw_begin,
499 .end = si_query_sw_end,
500 .get_result = si_query_sw_get_result,
501 .get_result_resource = NULL};
502
si_query_sw_create(unsigned query_type)503 static struct pipe_query *si_query_sw_create(unsigned query_type)
504 {
505 struct si_query_sw *query;
506
507 query = CALLOC_STRUCT(si_query_sw);
508 if (!query)
509 return NULL;
510
511 query->b.type = query_type;
512 query->b.ops = &sw_query_ops;
513
514 return (struct pipe_query *)query;
515 }
516
si_query_buffer_destroy(struct si_screen * sscreen,struct si_query_buffer * buffer)517 void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
518 {
519 struct si_query_buffer *prev = buffer->previous;
520
521 /* Release all query buffers. */
522 while (prev) {
523 struct si_query_buffer *qbuf = prev;
524 prev = prev->previous;
525 si_resource_reference(&qbuf->buf, NULL);
526 FREE(qbuf);
527 }
528
529 si_resource_reference(&buffer->buf, NULL);
530 }
531
si_query_buffer_reset(struct si_context * sctx,struct si_query_buffer * buffer)532 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
533 {
534 /* Discard all query buffers except for the oldest. */
535 while (buffer->previous) {
536 struct si_query_buffer *qbuf = buffer->previous;
537 buffer->previous = qbuf->previous;
538
539 si_resource_reference(&buffer->buf, NULL);
540 buffer->buf = qbuf->buf; /* move ownership */
541 FREE(qbuf);
542 }
543 buffer->results_end = 0;
544
545 if (!buffer->buf)
546 return;
547
548 /* Discard even the oldest buffer if it can't be mapped without a stall. */
549 if (si_cs_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
550 !sctx->ws->buffer_wait(sctx->ws, buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
551 si_resource_reference(&buffer->buf, NULL);
552 } else {
553 buffer->unprepared = true;
554 }
555 }
556
si_query_buffer_alloc(struct si_context * sctx,struct si_query_buffer * buffer,bool (* prepare_buffer)(struct si_context *,struct si_query_buffer *),unsigned size)557 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
558 bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
559 unsigned size)
560 {
561 bool unprepared = buffer->unprepared;
562 buffer->unprepared = false;
563
564 if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
565 if (buffer->buf) {
566 struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
567 memcpy(qbuf, buffer, sizeof(*qbuf));
568 buffer->previous = qbuf;
569 }
570 buffer->results_end = 0;
571
572 /* Queries are normally read by the CPU after
573 * being written by the gpu, hence staging is probably a good
574 * usage pattern.
575 */
576 struct si_screen *screen = sctx->screen;
577 unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
578 buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
579 if (unlikely(!buffer->buf))
580 return false;
581 unprepared = true;
582 }
583
584 if (unprepared && prepare_buffer) {
585 if (unlikely(!prepare_buffer(sctx, buffer))) {
586 si_resource_reference(&buffer->buf, NULL);
587 return false;
588 }
589 }
590
591 return true;
592 }
593
si_query_hw_destroy(struct si_context * sctx,struct si_query * squery)594 void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
595 {
596 struct si_query_hw *query = (struct si_query_hw *)squery;
597
598 si_query_buffer_destroy(sctx->screen, &query->buffer);
599 si_resource_reference(&query->workaround_buf, NULL);
600 FREE(squery);
601 }
602
si_query_hw_prepare_buffer(struct si_context * sctx,struct si_query_buffer * qbuf)603 static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
604 {
605 struct si_query_hw *query = container_of(qbuf, struct si_query_hw, buffer);
606 struct si_screen *screen = sctx->screen;
607
608 /* The caller ensures that the buffer is currently unused by the GPU. */
609 uint32_t *results = screen->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
610 PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
611 if (!results)
612 return false;
613
614 memset(results, 0, qbuf->buf->b.b.width0);
615
616 if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
617 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
618 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
619 unsigned max_rbs = screen->info.max_render_backends;
620 unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
621 unsigned num_results;
622 unsigned i, j;
623
624 /* Set top bits for unused backends. */
625 num_results = qbuf->buf->b.b.width0 / query->result_size;
626 for (j = 0; j < num_results; j++) {
627 for (i = 0; i < max_rbs; i++) {
628 if (!(enabled_rb_mask & (1 << i))) {
629 results[(i * 4) + 1] = 0x80000000;
630 results[(i * 4) + 3] = 0x80000000;
631 }
632 }
633 results += 4 * max_rbs;
634 }
635 }
636
637 return true;
638 }
639
si_query_pipestats_num_results(struct si_screen * sscreen)640 static unsigned si_query_pipestats_num_results(struct si_screen *sscreen)
641 {
642 return sscreen->info.gfx_level >= GFX11 ? 14 : 11;
643 }
644
si_query_pipestat_dw_offset(enum pipe_statistics_query_index index)645 static unsigned si_query_pipestat_dw_offset(enum pipe_statistics_query_index index)
646 {
647 switch (index) {
648 case PIPE_STAT_QUERY_PS_INVOCATIONS: return 0;
649 case PIPE_STAT_QUERY_C_PRIMITIVES: return 2;
650 case PIPE_STAT_QUERY_C_INVOCATIONS: return 4;
651 case PIPE_STAT_QUERY_VS_INVOCATIONS: return 6;
652 case PIPE_STAT_QUERY_GS_INVOCATIONS: return 8;
653 case PIPE_STAT_QUERY_GS_PRIMITIVES: return 10;
654 case PIPE_STAT_QUERY_IA_PRIMITIVES: return 12;
655 case PIPE_STAT_QUERY_IA_VERTICES: return 14;
656 case PIPE_STAT_QUERY_HS_INVOCATIONS: return 16;
657 case PIPE_STAT_QUERY_DS_INVOCATIONS: return 18;
658 case PIPE_STAT_QUERY_CS_INVOCATIONS: return 20;
659 /* gfx11: MS_INVOCATIONS */
660 /* gfx11: MS_PRIMITIVES */
661 /* gfx11: TS_INVOCATIONS */
662 default:
663 assert(false);
664 }
665 return ~0;
666 }
667
si_query_pipestat_end_dw_offset(struct si_screen * sscreen,enum pipe_statistics_query_index index)668 unsigned si_query_pipestat_end_dw_offset(struct si_screen *sscreen,
669 enum pipe_statistics_query_index index)
670 {
671 return si_query_pipestats_num_results(sscreen) * 2 + si_query_pipestat_dw_offset(index);
672 }
673
674 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
675 enum pipe_query_flags flags,
676 enum pipe_query_value_type result_type,
677 int index, struct pipe_resource *resource,
678 unsigned offset);
679
680 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
681 struct si_resource *buffer, uint64_t va);
682 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
683 struct si_resource *buffer, uint64_t va);
684 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer,
685 union pipe_query_result *result);
686 static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *);
687
688 static struct si_query_hw_ops query_hw_default_hw_ops = {
689 .prepare_buffer = si_query_hw_prepare_buffer,
690 .emit_start = si_query_hw_do_emit_start,
691 .emit_stop = si_query_hw_do_emit_stop,
692 .clear_result = si_query_hw_clear_result,
693 .add_result = si_query_hw_add_result,
694 };
695
si_query_hw_create(struct si_screen * sscreen,unsigned query_type,unsigned index)696 static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
697 unsigned index)
698 {
699 struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
700 if (!query)
701 return NULL;
702
703 query->b.type = query_type;
704 query->b.ops = &query_hw_ops;
705 query->ops = &query_hw_default_hw_ops;
706
707 switch (query_type) {
708 case PIPE_QUERY_OCCLUSION_COUNTER:
709 case PIPE_QUERY_OCCLUSION_PREDICATE:
710 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
711 query->result_size = 16 * sscreen->info.max_render_backends;
712 query->result_size += 16; /* for the fence + alignment */
713 query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
714 break;
715 case PIPE_QUERY_TIME_ELAPSED:
716 query->result_size = 24;
717 query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
718 break;
719 case PIPE_QUERY_TIMESTAMP:
720 query->result_size = 16;
721 query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
722 query->flags = SI_QUERY_HW_FLAG_NO_START;
723 break;
724 case PIPE_QUERY_PRIMITIVES_EMITTED:
725 case PIPE_QUERY_PRIMITIVES_GENERATED:
726 case PIPE_QUERY_SO_STATISTICS:
727 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
728 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
729 query->result_size = 32;
730 query->b.num_cs_dw_suspend = 6;
731 query->stream = index;
732 break;
733 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
734 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
735 query->result_size = 32 * SI_MAX_STREAMS;
736 query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
737 break;
738 case PIPE_QUERY_PIPELINE_STATISTICS:
739 query->result_size = si_query_pipestats_num_results(sscreen) * 16;
740 query->result_size += 8; /* for the fence + alignment */
741 query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
742 query->index = index;
743 if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) &&
744 sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3))
745 query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
746 break;
747 default:
748 assert(0);
749 FREE(query);
750 return NULL;
751 }
752
753 return (struct pipe_query *)query;
754 }
755
si_update_occlusion_query_state(struct si_context * sctx,unsigned type,int diff)756 static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
757 {
758 if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
759 type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
760 bool old_enable = sctx->num_occlusion_queries != 0;
761 bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
762 bool enable, perfect_enable;
763
764 sctx->num_occlusion_queries += diff;
765 assert(sctx->num_occlusion_queries >= 0);
766
767 if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
768 sctx->num_perfect_occlusion_queries += diff;
769 assert(sctx->num_perfect_occlusion_queries >= 0);
770 }
771
772 enable = sctx->num_occlusion_queries != 0;
773 perfect_enable = sctx->num_perfect_occlusion_queries != 0;
774
775 if (enable != old_enable || perfect_enable != old_perfect_enable) {
776 si_set_occlusion_query_state(sctx, old_perfect_enable);
777 }
778 }
779 }
780
event_type_for_stream(unsigned stream)781 static unsigned event_type_for_stream(unsigned stream)
782 {
783 switch (stream) {
784 default:
785 case 0:
786 return V_028A90_SAMPLE_STREAMOUTSTATS;
787 case 1:
788 return V_028A90_SAMPLE_STREAMOUTSTATS1;
789 case 2:
790 return V_028A90_SAMPLE_STREAMOUTSTATS2;
791 case 3:
792 return V_028A90_SAMPLE_STREAMOUTSTATS3;
793 }
794 }
795
emit_sample_streamout(struct radeon_cmdbuf * cs,uint64_t va,unsigned stream)796 static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
797 {
798 radeon_begin(cs);
799 radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
800 radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
801 radeon_emit(va);
802 radeon_emit(va >> 32);
803 radeon_end();
804 }
805
si_query_hw_do_emit_start(struct si_context * sctx,struct si_query_hw * query,struct si_resource * buffer,uint64_t va)806 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
807 struct si_resource *buffer, uint64_t va)
808 {
809 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
810
811 switch (query->b.type) {
812 case PIPE_QUERY_OCCLUSION_COUNTER:
813 case PIPE_QUERY_OCCLUSION_PREDICATE:
814 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
815 radeon_begin(cs);
816 if (sctx->gfx_level >= GFX11) {
817 uint64_t rb_mask = BITFIELD64_MASK(sctx->screen->info.max_render_backends);
818
819 radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
820 radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));
821 radeon_emit(PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) |
822 PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
823 PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
824 radeon_emit(PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
825 }
826
827 radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
828 if (sctx->gfx_level >= GFX11)
829 radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
830 else
831 radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
832 radeon_emit(va);
833 radeon_emit(va >> 32);
834 radeon_end();
835 break;
836 }
837 case PIPE_QUERY_PRIMITIVES_EMITTED:
838 case PIPE_QUERY_PRIMITIVES_GENERATED:
839 case PIPE_QUERY_SO_STATISTICS:
840 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
841 emit_sample_streamout(cs, va, query->stream);
842 break;
843 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
844 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
845 emit_sample_streamout(cs, va + 32 * stream, stream);
846 break;
847 case PIPE_QUERY_TIME_ELAPSED:
848 si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
849 EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
850 break;
851 case PIPE_QUERY_PIPELINE_STATISTICS: {
852 if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
853 /* The hw GS primitive counter doesn't work when ngg is active.
854 * So if use_ngg is true, we don't use the hw version but instead
855 * emulate it in the GS shader.
856 * The value is written at the same position, so we don't need to
857 * change anything else.
858 * If ngg is enabled for the draw, the primitive count is written in
859 * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported
860 * vertices is stored in gs_emitted_vertices and the number of prim
861 * is computed based on the output prim type in emit_gs_epilogue.
862 */
863 struct pipe_shader_buffer sbuf;
864 sbuf.buffer = &buffer->b.b;
865 sbuf.buffer_offset = query->buffer.results_end;
866 sbuf.buffer_size = buffer->bo_size;
867 si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
868 SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 1);
869
870 const uint32_t zero = 0;
871 radeon_begin(cs);
872 /* Clear the emulated counter end value. We don't clear start because it's unused. */
873 va += si_query_pipestat_end_dw_offset(sctx->screen, query->index) * 4;
874 radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
875 radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
876 radeon_emit(va);
877 radeon_emit(va >> 32);
878 radeon_emit(zero);
879 radeon_end();
880
881 sctx->num_pipeline_stat_emulated_queries++;
882 } else {
883 radeon_begin(cs);
884 radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
885 radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
886 radeon_emit(va);
887 radeon_emit(va >> 32);
888 radeon_end();
889 }
890 break;
891 }
892 default:
893 assert(0);
894 }
895 radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
896 RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
897 }
898
si_query_hw_emit_start(struct si_context * sctx,struct si_query_hw * query)899 static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
900 {
901 uint64_t va;
902
903 if (!query->buffer.buf && query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
904 si_resource_reference(&query->buffer.buf, sctx->pipeline_stats_query_buf);
905
906 /* Don't realloc pipeline_stats_query_buf */
907 if ((!(query->flags & SI_QUERY_EMULATE_GS_COUNTERS) || !sctx->pipeline_stats_query_buf) &&
908 !si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
909 return;
910
911 if (query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
912 si_resource_reference(&sctx->pipeline_stats_query_buf, query->buffer.buf);
913
914 si_update_occlusion_query_state(sctx, query->b.type, 1);
915 si_update_prims_generated_query_state(sctx, query->b.type, 1);
916
917 if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
918 sctx->num_pipeline_stat_queries++;
919
920 si_need_gfx_cs_space(sctx, 0);
921
922 va = query->buffer.buf->gpu_address + query->buffer.results_end;
923 query->ops->emit_start(sctx, query, query->buffer.buf, va);
924 }
925
si_query_hw_do_emit_stop(struct si_context * sctx,struct si_query_hw * query,struct si_resource * buffer,uint64_t va)926 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
927 struct si_resource *buffer, uint64_t va)
928 {
929 struct radeon_cmdbuf *cs = &sctx->gfx_cs;
930 uint64_t fence_va = 0;
931
932 switch (query->b.type) {
933 case PIPE_QUERY_OCCLUSION_COUNTER:
934 case PIPE_QUERY_OCCLUSION_PREDICATE:
935 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
936 va += 8;
937 radeon_begin(cs);
938 radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
939 if (sctx->gfx_level >= GFX11)
940 radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
941 else
942 radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
943 radeon_emit(va);
944 radeon_emit(va >> 32);
945 radeon_end();
946
947 fence_va = va + sctx->screen->info.max_render_backends * 16 - 8;
948 break;
949 }
950 case PIPE_QUERY_PRIMITIVES_EMITTED:
951 case PIPE_QUERY_PRIMITIVES_GENERATED:
952 case PIPE_QUERY_SO_STATISTICS:
953 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
954 va += 16;
955 emit_sample_streamout(cs, va, query->stream);
956 break;
957 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
958 va += 16;
959 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
960 emit_sample_streamout(cs, va + 32 * stream, stream);
961 break;
962 case PIPE_QUERY_TIME_ELAPSED:
963 va += 8;
964 FALLTHROUGH;
965 case PIPE_QUERY_TIMESTAMP:
966 si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
967 EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
968 fence_va = va + 8;
969 break;
970 case PIPE_QUERY_PIPELINE_STATISTICS: {
971 unsigned sample_size = (query->result_size - 8) / 2;
972
973 va += sample_size;
974
975 radeon_begin(cs);
976 if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
977 radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
978 radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
979
980 if (--sctx->num_pipeline_stat_emulated_queries == 0) {
981 si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
982 SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 0);
983 }
984 } else {
985 radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
986 radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
987 radeon_emit(va);
988 radeon_emit(va >> 32);
989 }
990 radeon_end();
991
992 fence_va = va + sample_size;
993 break;
994 }
995 default:
996 assert(0);
997 }
998 radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
999 RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
1000
1001 if (fence_va) {
1002 si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
1003 EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
1004 query->b.type);
1005 }
1006 }
1007
si_query_hw_emit_stop(struct si_context * sctx,struct si_query_hw * query)1008 static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
1009 {
1010 uint64_t va;
1011
1012 /* The queries which need begin already called this in begin_query. */
1013 if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1014 si_need_gfx_cs_space(sctx, 0);
1015 if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
1016 query->result_size))
1017 return;
1018 }
1019
1020 if (!query->buffer.buf)
1021 return; // previous buffer allocation failure
1022
1023 /* emit end query */
1024 va = query->buffer.buf->gpu_address + query->buffer.results_end;
1025
1026 query->ops->emit_stop(sctx, query, query->buffer.buf, va);
1027
1028 query->buffer.results_end += query->result_size;
1029
1030 si_update_occlusion_query_state(sctx, query->b.type, -1);
1031 si_update_prims_generated_query_state(sctx, query->b.type, -1);
1032
1033 if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
1034 sctx->num_pipeline_stat_queries--;
1035 }
1036
emit_set_predicate(struct si_context * ctx,struct si_resource * buf,uint64_t va,uint32_t op)1037 static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
1038 uint32_t op)
1039 {
1040 struct radeon_cmdbuf *cs = &ctx->gfx_cs;
1041
1042 radeon_begin(cs);
1043
1044 if (ctx->gfx_level >= GFX9) {
1045 radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
1046 radeon_emit(op);
1047 radeon_emit(va);
1048 radeon_emit(va >> 32);
1049 } else {
1050 radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
1051 radeon_emit(va);
1052 radeon_emit(op | ((va >> 32) & 0xFF));
1053 }
1054 radeon_end();
1055
1056 radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ | RADEON_PRIO_QUERY);
1057 }
1058
si_emit_query_predication(struct si_context * ctx)1059 static void si_emit_query_predication(struct si_context *ctx)
1060 {
1061 uint32_t op;
1062 bool flag_wait, invert;
1063
1064 struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
1065 if (!query)
1066 return;
1067
1068 invert = ctx->render_cond_invert;
1069 flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
1070 ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
1071
1072 if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1073 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
1074 struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query;
1075 struct gfx10_sh_query_buffer *qbuf, *first, *last;
1076
1077 op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1078
1079 /* if true then invert, see GL_ARB_conditional_render_inverted */
1080 if (!invert)
1081 op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1082 else
1083 op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1084
1085 op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1086
1087 first = gfx10_query->first;
1088 last = gfx10_query->last;
1089
1090 while (first) {
1091 qbuf = first;
1092 if (first != last)
1093 first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
1094 else
1095 first = NULL;
1096
1097 unsigned results_base = gfx10_query->first_begin;
1098 uint64_t va_base = qbuf->buf->gpu_address;
1099 uint64_t va = va_base + results_base;
1100
1101 unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
1102 unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;
1103
1104 unsigned count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
1105 do {
1106 if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1107 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1108 emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);
1109
1110 /* set CONTINUE bit for all packets except the first */
1111 op |= PREDICATION_CONTINUE;
1112 }
1113 } else {
1114 emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
1115 op |= PREDICATION_CONTINUE;
1116 }
1117
1118 results_base += sizeof(struct gfx10_sh_query_buffer_mem);
1119 } while (count--);
1120 }
1121 } else {
1122 struct si_query_buffer *qbuf;
1123
1124 if (query->workaround_buf) {
1125 op = PRED_OP(PREDICATION_OP_BOOL64);
1126 } else {
1127 switch (query->b.type) {
1128 case PIPE_QUERY_OCCLUSION_COUNTER:
1129 case PIPE_QUERY_OCCLUSION_PREDICATE:
1130 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1131 op = PRED_OP(PREDICATION_OP_ZPASS);
1132 break;
1133 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1134 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1135 op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1136 invert = !invert;
1137 break;
1138 default:
1139 assert(0);
1140 return;
1141 }
1142 }
1143
1144 /* if true then invert, see GL_ARB_conditional_render_inverted */
1145 if (invert)
1146 op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1147 else
1148 op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1149
1150 /* Use the value written by compute shader as a workaround. Note that
1151 * the wait flag does not apply in this predication mode.
1152 *
1153 * The shader outputs the result value to L2. Workarounds only affect GFX8
1154 * and later, where the CP reads data from L2, so we don't need an
1155 * additional flush.
1156 */
1157 if (query->workaround_buf) {
1158 uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
1159 emit_set_predicate(ctx, query->workaround_buf, va, op);
1160 return;
1161 }
1162
1163 op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1164
1165 /* emit predicate packets for all data blocks */
1166 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1167 unsigned results_base = 0;
1168 uint64_t va_base = qbuf->buf->gpu_address;
1169
1170 while (results_base < qbuf->results_end) {
1171 uint64_t va = va_base + results_base;
1172
1173 if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1174 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1175 emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1176
1177 /* set CONTINUE bit for all packets except the first */
1178 op |= PREDICATION_CONTINUE;
1179 }
1180 } else {
1181 emit_set_predicate(ctx, qbuf->buf, va, op);
1182 op |= PREDICATION_CONTINUE;
1183 }
1184
1185 results_base += query->result_size;
1186 }
1187 }
1188 }
1189 }
1190
si_create_query(struct pipe_context * ctx,unsigned query_type,unsigned index)1191 static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
1192 unsigned index)
1193 {
1194 struct si_screen *sscreen = (struct si_screen *)ctx->screen;
1195
1196 if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
1197 (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
1198 return si_query_sw_create(query_type);
1199
1200 if (sscreen->use_ngg_streamout &&
1201 (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
1202 query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
1203 query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1204 query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
1205 return gfx10_sh_query_create(sscreen, query_type, index);
1206
1207 return si_query_hw_create(sscreen, query_type, index);
1208 }
1209
si_destroy_query(struct pipe_context * ctx,struct pipe_query * query)1210 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1211 {
1212 struct si_context *sctx = (struct si_context *)ctx;
1213 struct si_query *squery = (struct si_query *)query;
1214
1215 squery->ops->destroy(sctx, squery);
1216 }
1217
si_begin_query(struct pipe_context * ctx,struct pipe_query * query)1218 static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
1219 {
1220 struct si_context *sctx = (struct si_context *)ctx;
1221 struct si_query *squery = (struct si_query *)query;
1222
1223 return squery->ops->begin(sctx, squery);
1224 }
1225
si_query_hw_begin(struct si_context * sctx,struct si_query * squery)1226 bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
1227 {
1228 struct si_query_hw *query = (struct si_query_hw *)squery;
1229
1230 if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1231 assert(0);
1232 return false;
1233 }
1234
1235 if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
1236 si_query_buffer_reset(sctx, &query->buffer);
1237
1238 si_resource_reference(&query->workaround_buf, NULL);
1239
1240 si_query_hw_emit_start(sctx, query);
1241 if (!query->buffer.buf)
1242 return false;
1243
1244 list_addtail(&query->b.active_list, &sctx->active_queries);
1245 sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1246 return true;
1247 }
1248
si_end_query(struct pipe_context * ctx,struct pipe_query * query)1249 static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
1250 {
1251 struct si_context *sctx = (struct si_context *)ctx;
1252 struct si_query *squery = (struct si_query *)query;
1253
1254 return squery->ops->end(sctx, squery);
1255 }
1256
si_query_hw_end(struct si_context * sctx,struct si_query * squery)1257 bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
1258 {
1259 struct si_query_hw *query = (struct si_query_hw *)squery;
1260
1261 if (query->flags & SI_QUERY_HW_FLAG_NO_START)
1262 si_query_buffer_reset(sctx, &query->buffer);
1263
1264 si_query_hw_emit_stop(sctx, query);
1265
1266 if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
1267 list_delinit(&query->b.active_list);
1268 sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
1269 }
1270
1271 if (!query->buffer.buf)
1272 return false;
1273
1274 return true;
1275 }
1276
si_get_hw_query_params(struct si_context * sctx,struct si_query_hw * squery,int index,struct si_hw_query_params * params)1277 static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
1278 struct si_hw_query_params *params)
1279 {
1280 unsigned max_rbs = sctx->screen->info.max_render_backends;
1281
1282 params->pair_stride = 0;
1283 params->pair_count = 1;
1284
1285 switch (squery->b.type) {
1286 case PIPE_QUERY_OCCLUSION_COUNTER:
1287 case PIPE_QUERY_OCCLUSION_PREDICATE:
1288 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1289 params->start_offset = 0;
1290 params->end_offset = 8;
1291 params->fence_offset = max_rbs * 16;
1292 params->pair_stride = 16;
1293 params->pair_count = max_rbs;
1294 break;
1295 case PIPE_QUERY_TIME_ELAPSED:
1296 params->start_offset = 0;
1297 params->end_offset = 8;
1298 params->fence_offset = 16;
1299 break;
1300 case PIPE_QUERY_TIMESTAMP:
1301 params->start_offset = 0;
1302 params->end_offset = 0;
1303 params->fence_offset = 8;
1304 break;
1305 case PIPE_QUERY_PRIMITIVES_EMITTED:
1306 params->start_offset = 8;
1307 params->end_offset = 24;
1308 params->fence_offset = params->end_offset + 4;
1309 break;
1310 case PIPE_QUERY_PRIMITIVES_GENERATED:
1311 params->start_offset = 0;
1312 params->end_offset = 16;
1313 params->fence_offset = params->end_offset + 4;
1314 break;
1315 case PIPE_QUERY_SO_STATISTICS:
1316 params->start_offset = 8 - index * 8;
1317 params->end_offset = 24 - index * 8;
1318 params->fence_offset = params->end_offset + 4;
1319 break;
1320 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1321 params->pair_count = SI_MAX_STREAMS;
1322 params->pair_stride = 32;
1323 FALLTHROUGH;
1324 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1325 params->start_offset = 0;
1326 params->end_offset = 16;
1327
1328 /* We can re-use the high dword of the last 64-bit value as a
1329 * fence: it is initialized as 0, and the high bit is set by
1330 * the write of the streamout stats event.
1331 */
1332 params->fence_offset = squery->result_size - 4;
1333 break;
1334 case PIPE_QUERY_PIPELINE_STATISTICS: {
1335 params->start_offset = si_query_pipestat_dw_offset(index) * 4;
1336 params->end_offset = si_query_pipestat_end_dw_offset(sctx->screen, index) * 4;
1337 params->fence_offset = si_query_pipestats_num_results(sctx->screen) * 16;
1338 break;
1339 }
1340 default:
1341 unreachable("si_get_hw_query_params unsupported");
1342 }
1343 }
1344
si_query_read_result(void * map,unsigned start_index,unsigned end_index,bool test_status_bit)1345 static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
1346 bool test_status_bit)
1347 {
1348 uint32_t *current_result = (uint32_t *)map;
1349 uint64_t start, end;
1350
1351 start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
1352 end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
1353
1354 if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1355 return end - start;
1356 }
1357 return 0;
1358 }
1359
si_query_hw_add_result(struct si_screen * sscreen,struct si_query_hw * query,void * buffer,union pipe_query_result * result)1360 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1361 void *buffer, union pipe_query_result *result)
1362 {
1363 unsigned max_rbs = sscreen->info.max_render_backends;
1364
1365 switch (query->b.type) {
1366 case PIPE_QUERY_OCCLUSION_COUNTER: {
1367 for (unsigned i = 0; i < max_rbs; ++i) {
1368 unsigned results_base = i * 16;
1369 result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1370 }
1371 break;
1372 }
1373 case PIPE_QUERY_OCCLUSION_PREDICATE:
1374 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1375 for (unsigned i = 0; i < max_rbs; ++i) {
1376 unsigned results_base = i * 16;
1377 result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1378 }
1379 break;
1380 }
1381 case PIPE_QUERY_TIME_ELAPSED:
1382 result->u64 += si_query_read_result(buffer, 0, 2, false);
1383 break;
1384 case PIPE_QUERY_TIMESTAMP:
1385 result->u64 = *(uint64_t *)buffer;
1386 break;
1387 case PIPE_QUERY_PRIMITIVES_EMITTED:
1388 /* SAMPLE_STREAMOUTSTATS stores this structure:
1389 * {
1390 * u64 NumPrimitivesWritten;
1391 * u64 PrimitiveStorageNeeded;
1392 * }
1393 * We only need NumPrimitivesWritten here. */
1394 result->u64 += si_query_read_result(buffer, 2, 6, true);
1395 break;
1396 case PIPE_QUERY_PRIMITIVES_GENERATED:
1397 /* Here we read PrimitiveStorageNeeded. */
1398 result->u64 += si_query_read_result(buffer, 0, 4, true);
1399 break;
1400 case PIPE_QUERY_SO_STATISTICS:
1401 result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1402 result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1403 break;
1404 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1405 result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1406 si_query_read_result(buffer, 0, 4, true);
1407 break;
1408 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1409 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1410 result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1411 si_query_read_result(buffer, 0, 4, true);
1412 buffer = (char *)buffer + 32;
1413 }
1414 break;
1415 case PIPE_QUERY_PIPELINE_STATISTICS:
1416 for (int i = 0; i < 11; i++) {
1417 result->pipeline_statistics.counters[i] +=
1418 si_query_read_result(buffer, si_query_pipestat_dw_offset(i),
1419 si_query_pipestat_end_dw_offset(sscreen, i), false);
1420 }
1421 #if 0 /* for testing */
1422 printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1423 "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1424 "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1425 result->pipeline_statistics.ia_vertices,
1426 result->pipeline_statistics.ia_primitives,
1427 result->pipeline_statistics.vs_invocations,
1428 result->pipeline_statistics.hs_invocations,
1429 result->pipeline_statistics.ds_invocations,
1430 result->pipeline_statistics.gs_invocations,
1431 result->pipeline_statistics.gs_primitives,
1432 result->pipeline_statistics.c_invocations,
1433 result->pipeline_statistics.c_primitives,
1434 result->pipeline_statistics.ps_invocations,
1435 result->pipeline_statistics.cs_invocations);
1436 #endif
1437 break;
1438 default:
1439 assert(0);
1440 }
1441 }
1442
si_query_hw_suspend(struct si_context * sctx,struct si_query * query)1443 void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
1444 {
1445 si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
1446 }
1447
si_query_hw_resume(struct si_context * sctx,struct si_query * query)1448 void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
1449 {
1450 si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
1451 }
1452
1453 static const struct si_query_ops query_hw_ops = {
1454 .destroy = si_query_hw_destroy,
1455 .begin = si_query_hw_begin,
1456 .end = si_query_hw_end,
1457 .get_result = si_query_hw_get_result,
1458 .get_result_resource = si_query_hw_get_result_resource,
1459
1460 .suspend = si_query_hw_suspend,
1461 .resume = si_query_hw_resume,
1462 };
1463
si_get_query_result(struct pipe_context * ctx,struct pipe_query * query,bool wait,union pipe_query_result * result)1464 static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1465 union pipe_query_result *result)
1466 {
1467 struct si_context *sctx = (struct si_context *)ctx;
1468 struct si_query *squery = (struct si_query *)query;
1469
1470 return squery->ops->get_result(sctx, squery, wait, result);
1471 }
1472
si_get_query_result_resource(struct pipe_context * ctx,struct pipe_query * query,enum pipe_query_flags flags,enum pipe_query_value_type result_type,int index,struct pipe_resource * resource,unsigned offset)1473 static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1474 enum pipe_query_flags flags, enum pipe_query_value_type result_type,
1475 int index, struct pipe_resource *resource, unsigned offset)
1476 {
1477 struct si_context *sctx = (struct si_context *)ctx;
1478 struct si_query *squery = (struct si_query *)query;
1479
1480 squery->ops->get_result_resource(sctx, squery, flags, result_type, index, resource, offset);
1481 }
1482
si_query_hw_clear_result(struct si_query_hw * query,union pipe_query_result * result)1483 static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1484 {
1485 util_query_clear_result(result, query->b.type);
1486 }
1487
si_query_hw_get_result(struct si_context * sctx,struct si_query * squery,bool wait,union pipe_query_result * result)1488 bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1489 union pipe_query_result *result)
1490 {
1491 struct si_screen *sscreen = sctx->screen;
1492 struct si_query_hw *query = (struct si_query_hw *)squery;
1493 struct si_query_buffer *qbuf;
1494
1495 query->ops->clear_result(query, result);
1496
1497 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1498 unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1499 unsigned results_base = 0;
1500 void *map;
1501
1502 if (squery->b.flushed)
1503 map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
1504 else
1505 map = si_buffer_map(sctx, qbuf->buf, usage);
1506
1507 if (!map)
1508 return false;
1509
1510 while (results_base != qbuf->results_end) {
1511 query->ops->add_result(sscreen, query, map + results_base, result);
1512 results_base += query->result_size;
1513 }
1514 }
1515
1516 /* Convert the time to expected units. */
1517 if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
1518 squery->type == PIPE_QUERY_TIMESTAMP) {
1519 result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1520 }
1521 return true;
1522 }
1523
si_query_hw_get_result_resource(struct si_context * sctx,struct si_query * squery,enum pipe_query_flags flags,enum pipe_query_value_type result_type,int index,struct pipe_resource * resource,unsigned offset)1524 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1525 enum pipe_query_flags flags,
1526 enum pipe_query_value_type result_type,
1527 int index, struct pipe_resource *resource,
1528 unsigned offset)
1529 {
1530 struct si_query_hw *query = (struct si_query_hw *)squery;
1531 struct si_query_buffer *qbuf;
1532 struct si_query_buffer *qbuf_prev;
1533 struct pipe_resource *tmp_buffer = NULL;
1534 unsigned tmp_buffer_offset = 0;
1535 struct si_qbo_state saved_state = {};
1536 struct pipe_grid_info grid = {};
1537 struct pipe_constant_buffer constant_buffer = {};
1538 struct pipe_shader_buffer ssbo[3];
1539 struct si_hw_query_params params;
1540 struct {
1541 uint32_t end_offset;
1542 uint32_t result_stride;
1543 uint32_t result_count;
1544 uint32_t config;
1545 uint32_t fence_offset;
1546 uint32_t pair_stride;
1547 uint32_t pair_count;
1548 } consts;
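/* This layout must match the constant buffer read by the query result
 * compute shader (si_create_query_result_cs); it is bound to slot 0 below. */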
1549
1550 if (!sctx->query_result_shader) {
1551 sctx->query_result_shader = si_create_query_result_cs(sctx);
1552 if (!sctx->query_result_shader)
1553 return;
1554 }
1555
1556 if (query->buffer.previous) {
1557 u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1558 if (!tmp_buffer)
1559 return;
1560 }
1561
1562 si_save_qbo_state(sctx, &saved_state);
1563
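/* Look up where the begin/end values and the availability fence live within
 * one result slot for this query type. */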
1564 si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
1565 consts.end_offset = params.end_offset - params.start_offset;
1566 consts.fence_offset = params.fence_offset - params.start_offset;
1567 consts.result_stride = query->result_size;
1568 consts.pair_stride = params.pair_stride;
1569 consts.pair_count = params.pair_count;
1570
1571 constant_buffer.buffer_size = sizeof(consts);
1572 constant_buffer.user_buffer = &consts;
1573
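/* The result shader consumes three buffers: ssbo[0] is the source query
 * buffer, ssbo[1] is a small scratch buffer that carries partial results
 * across chained query buffers, and ssbo[2] is either that same scratch
 * buffer or, for the last buffer in the chain, the caller's destination. */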
1574 ssbo[1].buffer = tmp_buffer;
1575 ssbo[1].buffer_offset = tmp_buffer_offset;
1576 ssbo[1].buffer_size = 16;
1577
1578 ssbo[2] = ssbo[1];
1579
1580 grid.block[0] = 1;
1581 grid.block[1] = 1;
1582 grid.block[2] = 1;
1583 grid.grid[0] = 1;
1584 grid.grid[1] = 1;
1585 grid.grid[2] = 1;
1586
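/* consts.config is a bit field consumed by the result shader: 1 reads the
 * partial result accumulated so far, 2 writes the accumulated value back
 * out for further chaining, 4 returns result availability instead of the
 * value (index < 0), 8 converts the result to a boolean, 16 reads only the
 * last timestamp, 32 applies the timestamp unit conversion, 64/128 select
 * 64-bit or signed 32-bit output, and 256 enables the SO_OVERFLOW pair
 * mode. */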
1587 consts.config = 0;
1588 if (index < 0)
1589 consts.config |= 4;
1590 if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1591 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1592 consts.config |= 8;
1593 else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1594 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1595 consts.config |= 8 | 256;
1596 else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1597 consts.config |= 32;
1598
1599 switch (result_type) {
1600 case PIPE_QUERY_TYPE_U64:
1601 case PIPE_QUERY_TYPE_I64:
1602 consts.config |= 64;
1603 break;
1604 case PIPE_QUERY_TYPE_I32:
1605 consts.config |= 128;
1606 break;
1607 case PIPE_QUERY_TYPE_U32:
1608 break;
1609 }
1610
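/* Make the CP's earlier writes of query results and fences visible to the
 * compute shader, which reads them through L2. */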
1611 sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
1612
1613 for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1614 if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1615 qbuf_prev = qbuf->previous;
1616 consts.result_count = qbuf->results_end / query->result_size;
1617 consts.config &= ~3;
1618 if (qbuf != &query->buffer)
1619 consts.config |= 1;
1620 if (qbuf->previous)
1621 consts.config |= 2;
1622 } else {
1623 /* Only read the last timestamp. */
1624 qbuf_prev = NULL;
1625 consts.result_count = 0;
1626 consts.config |= 16;
1627 params.start_offset += qbuf->results_end - query->result_size;
1628 }
1629
1630 sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
1631
1632 ssbo[0].buffer = &qbuf->buf->b.b;
1633 ssbo[0].buffer_offset = params.start_offset;
1634 ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1635
1636 if (!qbuf->previous) {
1637 ssbo[2].buffer = resource;
1638 ssbo[2].buffer_offset = offset;
1639 ssbo[2].buffer_size = resource->width0 - offset;
1640 /* assert size is correct, based on result_type ? */
1641
1642 si_resource(resource)->TC_L2_dirty = true;
1643 }
1644
1645 if ((flags & PIPE_QUERY_WAIT) && qbuf == &query->buffer) {
1646 uint64_t va;
1647
1648 /* Wait for result availability. Wait only for readiness
1649 * of the last entry, since the fence writes should be
1650 * serialized in the CP.
1651 */
1652 va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1653 va += params.fence_offset;
1654
1655 si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
1656 }
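/* Run one 1x1x1 workgroup of the result shader over this buffer; the 0x4
 * mask marks only ssbo[2] (the output buffer) as writeable. */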
1657 si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader,
1658 SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
1659 3, ssbo, 0x4);
1660 }
1661
1662 si_restore_qbo_state(sctx, &saved_state);
1663 pipe_resource_reference(&tmp_buffer, NULL);
1664 }
1665
1666 static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
1667 enum pipe_render_cond_flag mode)
1668 {
1669 struct si_context *sctx = (struct si_context *)ctx;
1670 struct si_query_hw *squery = (struct si_query_hw *)query;
1671 struct si_atom *atom = &sctx->atoms.s.render_cond;
1672
1673 if (query) {
1674 bool needs_workaround = false;
1675
1676 /* There was a firmware regression in GFX8 which causes successive
1677 * SET_PREDICATION packets to give the wrong answer for
1678 * non-inverted stream overflow predication.
1679 */
1680 if (((sctx->gfx_level == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
1681 (sctx->gfx_level == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
1682 !condition &&
1683 (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1684 (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1685 (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
1686 needs_workaround = true;
1687 }
1688
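/* The workaround reduces the query to a single 64-bit value in a small
 * zeroed buffer using the result compute shader, so the predication atom
 * can point SET_PREDICATION at that value instead of the raw overflow
 * pairs. */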
1689 if (needs_workaround && !squery->workaround_buf) {
1690 bool old_render_cond_enabled = sctx->render_cond_enabled;
1691 sctx->render_cond_enabled = false;
1692
1693 u_suballocator_alloc(&sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
1694 (struct pipe_resource **)&squery->workaround_buf);
1695
1696 /* Reset to NULL to avoid a redundant SET_PREDICATION
1697 * from launching the compute grid.
1698 */
1699 sctx->render_cond = NULL;
1700
1701 ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
1702 &squery->workaround_buf->b.b, squery->workaround_offset);
1703
1704 /* Setting this in the render cond atom is too late,
1705 * so set it here. */
1706 sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
1707
1708 sctx->render_cond_enabled = old_render_cond_enabled;
1709 }
1710 }
1711
1712 sctx->render_cond = query;
1713 sctx->render_cond_invert = condition;
1714 sctx->render_cond_mode = mode;
1715 sctx->render_cond_enabled = query;
1716
1717 si_set_atom_dirty(sctx, atom, query != NULL);
1718 }
1719
1720 void si_suspend_queries(struct si_context *sctx)
1721 {
1722 struct si_query *query;
1723
1724 LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1725 query->ops->suspend(sctx, query);
1726 }
1727
1728 void si_resume_queries(struct si_context *sctx)
1729 {
1730 struct si_query *query;
1731
1732 /* Check CS space here. Resuming must not be interrupted by flushes. */
1733 si_need_gfx_cs_space(sctx, 0);
1734
1735 LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1736 query->ops->resume(sctx, query);
1737 }
1738
1739 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
1740 { \
1741 .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1742 .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_ \
1743 }
1744
1745 #define X(name_, query_type_, type_, result_type_) \
1746 XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1747
1748 #define XG(group_, name_, query_type_, type_, result_type_) \
1749 XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1750
1751 static struct pipe_driver_query_info si_driver_query_list[] = {
1752 X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1753 X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1754 X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1755 X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1756 X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
1757 X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1758 X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1759 X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1760 X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1761 X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1762 X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1763 X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1764 X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1765 X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1766 X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1767 X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1768 X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1769 X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1770 X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1771 X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1772 X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1773 X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1774 X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1775 X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1776 X("slab-wasted-VRAM", SLAB_WASTED_VRAM, BYTES, AVERAGE),
1777 X("slab-wasted-GTT", SLAB_WASTED_GTT, BYTES, AVERAGE),
1778 X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1779 X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1780 X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1781 X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1782 X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1783 X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1784 X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1785 X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1786 X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1787 X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1788 X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1789 X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1790 X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1791 X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1792 X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1793 X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1794 X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1795 X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1796
1797 /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1798 * which use them as a fallback path to detect the GPU type.
1799 *
1800 * Note: The names of these queries are significant for GPUPerfStudio
1801 * (and possibly their order as well). */
1802 XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1803 XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1804 XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1805 XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1806 XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1807
1808 X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1809 X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1810 X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1811
1812 /* The following queries must be at the end of the list because their
1813 * availability is adjusted dynamically based on the DRM version. */
1814 X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1815 X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1816 X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1817 X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1818 X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1819 X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1820 X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1821 X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1822 X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1823 X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1824 X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1825 X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1826 X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1827 X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1828
1829 /* SRBM_STATUS2 */
1830 X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1831
1832 /* CP_STAT */
1833 X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1834 X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1835 X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1836 X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1837 X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1838 X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1839 };
1840
1841 #undef X
1842 #undef XG
1843 #undef XFULL
1844
1845 static unsigned si_get_num_queries(struct si_screen *sscreen)
1846 {
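/* The trailing entries of si_driver_query_list are GPU sensor/busy queries
 * whose availability depends on what the kernel can report, so drop however
 * many of them the current winsys doesn't support. */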
1847 /* amdgpu */
1848 if (sscreen->info.is_amdgpu) {
1849 if (sscreen->info.gfx_level >= GFX8)
1850 return ARRAY_SIZE(si_driver_query_list);
1851 else
1852 return ARRAY_SIZE(si_driver_query_list) - 7;
1853 }
1854
1855 /* radeon */
1856 if (!sscreen->info.has_read_registers_query)
1857 return ARRAY_SIZE(si_driver_query_list) - 21;
1858 if (sscreen->info.gfx_level == GFX7)
1859 return ARRAY_SIZE(si_driver_query_list) - 6;
1860 
1861 return ARRAY_SIZE(si_driver_query_list) - 7;
1862 }
1863
1864 static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1865 struct pipe_driver_query_info *info)
1866 {
1867 struct si_screen *sscreen = (struct si_screen *)screen;
1868 unsigned num_queries = si_get_num_queries(sscreen);
1869
1870 if (!info) {
1871 unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1872
1873 return num_queries + num_perfcounters;
1874 }
1875
1876 if (index >= num_queries)
1877 return si_get_perfcounter_info(sscreen, index - num_queries, info);
1878
1879 *info = si_driver_query_list[index];
1880
1881 switch (info->query_type) {
1882 case SI_QUERY_REQUESTED_VRAM:
1883 case SI_QUERY_VRAM_USAGE:
1884 case SI_QUERY_MAPPED_VRAM:
1885 case SI_QUERY_SLAB_WASTED_VRAM:
1886 info->max_value.u64 = (uint64_t)sscreen->info.vram_size_kb * 1024;
1887 break;
1888 case SI_QUERY_REQUESTED_GTT:
1889 case SI_QUERY_GTT_USAGE:
1890 case SI_QUERY_MAPPED_GTT:
1891 case SI_QUERY_SLAB_WASTED_GTT:
1892 info->max_value.u64 = (uint64_t)sscreen->info.gart_size_kb * 1024;
1893 break;
1894 case SI_QUERY_GPU_TEMPERATURE:
1895 info->max_value.u64 = 125;
1896 break;
1897 case SI_QUERY_VRAM_VIS_USAGE:
1898 info->max_value.u64 = (uint64_t)sscreen->info.vram_vis_size_kb * 1024;
1899 break;
1900 }
1901
1902 if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1903 info->group_id += sscreen->perfcounters->base.num_groups;
1904
1905 return 1;
1906 }
1907
1908 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1909 * performance counter groups, so be careful when changing this and related
1910 * functions.
1911 */
1912 static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1913 struct pipe_driver_query_group_info *info)
1914 {
1915 struct si_screen *sscreen = (struct si_screen *)screen;
1916 unsigned num_pc_groups = 0;
1917
1918 if (sscreen->perfcounters)
1919 num_pc_groups = sscreen->perfcounters->base.num_groups;
1920
1921 if (!info)
1922 return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1923
1924 if (index < num_pc_groups)
1925 return si_get_perfcounter_group_info(sscreen, index, info);
1926
1927 index -= num_pc_groups;
1928 if (index >= SI_NUM_SW_QUERY_GROUPS)
1929 return 0;
1930
1931 info->name = "GPIN";
1932 info->max_active_queries = 5;
1933 info->num_queries = 5;
1934 return 1;
1935 }
1936
1937 void si_init_query_functions(struct si_context *sctx)
1938 {
1939 sctx->b.create_query = si_create_query;
1940 sctx->b.create_batch_query = si_create_batch_query;
1941 sctx->b.destroy_query = si_destroy_query;
1942 sctx->b.begin_query = si_begin_query;
1943 sctx->b.end_query = si_end_query;
1944 sctx->b.get_query_result = si_get_query_result;
1945 sctx->b.get_query_result_resource = si_get_query_result_resource;
1946
1947 if (sctx->has_graphics) {
1948 sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1949 sctx->b.render_condition = si_render_condition;
1950 }
1951
1952 list_inithead(&sctx->active_queries);
1953 }
1954
1955 void si_init_screen_query_functions(struct si_screen *sscreen)
1956 {
1957 sscreen->b.get_driver_query_info = si_get_driver_query_info;
1958 sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1959 }
1960