/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_query.h"
#include "si_build_pm4.h"

#include "amd/common/sid.h"
#include "si_pipe.h"
#include "util/os_time.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
#include "util/u_upload_mgr.h"

static const struct si_query_ops query_hw_ops;

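/* Layout of one hardware query result slot: where the begin/end values and the
 * readiness fence live, and how per-RB/per-stream result pairs repeat
 * (filled in by si_get_hw_query_params below). */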
struct si_hw_query_params {
   unsigned start_offset;
   unsigned end_offset;
   unsigned fence_offset;
   unsigned pair_stride;
   unsigned pair_count;
};

/* Queries without buffer handling or suspend/resume. */
struct si_query_sw {
   struct si_query b;

   uint64_t begin_result;
   uint64_t end_result;

   uint64_t begin_time;
   uint64_t end_time;

   /* Fence for GPU_FINISHED. */
   struct pipe_fence_handle *fence;
};

static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_sw *query = (struct si_query_sw *)squery;

   sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
   FREE(query);
}

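/* Map driver-specific query types to the winsys counters they sample. */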
static enum radeon_value_id winsys_id_from_type(unsigned type)
{
   switch (type) {
   case SI_QUERY_REQUESTED_VRAM:
      return RADEON_REQUESTED_VRAM_MEMORY;
   case SI_QUERY_REQUESTED_GTT:
      return RADEON_REQUESTED_GTT_MEMORY;
   case SI_QUERY_MAPPED_VRAM:
      return RADEON_MAPPED_VRAM;
   case SI_QUERY_MAPPED_GTT:
      return RADEON_MAPPED_GTT;
   case SI_QUERY_SLAB_WASTED_VRAM:
      return RADEON_SLAB_WASTED_VRAM;
   case SI_QUERY_SLAB_WASTED_GTT:
      return RADEON_SLAB_WASTED_GTT;
   case SI_QUERY_BUFFER_WAIT_TIME:
      return RADEON_BUFFER_WAIT_TIME_NS;
   case SI_QUERY_NUM_MAPPED_BUFFERS:
      return RADEON_NUM_MAPPED_BUFFERS;
   case SI_QUERY_NUM_GFX_IBS:
      return RADEON_NUM_GFX_IBS;
   case SI_QUERY_GFX_BO_LIST_SIZE:
      return RADEON_GFX_BO_LIST_COUNTER;
   case SI_QUERY_GFX_IB_SIZE:
      return RADEON_GFX_IB_SIZE_COUNTER;
   case SI_QUERY_NUM_BYTES_MOVED:
      return RADEON_NUM_BYTES_MOVED;
   case SI_QUERY_NUM_EVICTIONS:
      return RADEON_NUM_EVICTIONS;
   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
      return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
   case SI_QUERY_VRAM_USAGE:
      return RADEON_VRAM_USAGE;
   case SI_QUERY_VRAM_VIS_USAGE:
      return RADEON_VRAM_VIS_USAGE;
   case SI_QUERY_GTT_USAGE:
      return RADEON_GTT_USAGE;
   case SI_QUERY_GPU_TEMPERATURE:
      return RADEON_GPU_TEMPERATURE;
   case SI_QUERY_CURRENT_GPU_SCLK:
      return RADEON_CURRENT_SCLK;
   case SI_QUERY_CURRENT_GPU_MCLK:
      return RADEON_CURRENT_MCLK;
   case SI_QUERY_CS_THREAD_BUSY:
      return RADEON_CS_THREAD_TIME;
   default:
      unreachable("query type does not correspond to winsys id");
   }
}

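/* Software queries snapshot a CPU-side counter at begin and end; the reported
 * result is the difference between the two snapshots. */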
static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_sw *query = (struct si_query_sw *)squery;
   enum radeon_value_id ws_id;

   switch (query->b.type) {
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_GPU_FINISHED:
      break;
   case SI_QUERY_DRAW_CALLS:
      query->begin_result = sctx->num_draw_calls;
      break;
   case SI_QUERY_DECOMPRESS_CALLS:
      query->begin_result = sctx->num_decompress_calls;
      break;
   case SI_QUERY_PRIM_RESTART_CALLS:
      query->begin_result = sctx->num_prim_restart_calls;
      break;
   case SI_QUERY_COMPUTE_CALLS:
      query->begin_result = sctx->num_compute_calls;
      break;
   case SI_QUERY_CP_DMA_CALLS:
      query->begin_result = sctx->num_cp_dma_calls;
      break;
   case SI_QUERY_NUM_VS_FLUSHES:
      query->begin_result = sctx->num_vs_flushes;
      break;
   case SI_QUERY_NUM_PS_FLUSHES:
      query->begin_result = sctx->num_ps_flushes;
      break;
   case SI_QUERY_NUM_CS_FLUSHES:
      query->begin_result = sctx->num_cs_flushes;
      break;
   case SI_QUERY_NUM_CB_CACHE_FLUSHES:
      query->begin_result = sctx->num_cb_cache_flushes;
      break;
   case SI_QUERY_NUM_DB_CACHE_FLUSHES:
      query->begin_result = sctx->num_db_cache_flushes;
      break;
   case SI_QUERY_NUM_L2_INVALIDATES:
      query->begin_result = sctx->num_L2_invalidates;
      break;
   case SI_QUERY_NUM_L2_WRITEBACKS:
      query->begin_result = sctx->num_L2_writebacks;
      break;
   case SI_QUERY_NUM_RESIDENT_HANDLES:
      query->begin_result = sctx->num_resident_handles;
      break;
   case SI_QUERY_TC_OFFLOADED_SLOTS:
      query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
      break;
   case SI_QUERY_TC_DIRECT_SLOTS:
      query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
      break;
   case SI_QUERY_TC_NUM_SYNCS:
      query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
      break;
   case SI_QUERY_REQUESTED_VRAM:
   case SI_QUERY_REQUESTED_GTT:
   case SI_QUERY_MAPPED_VRAM:
   case SI_QUERY_MAPPED_GTT:
   case SI_QUERY_SLAB_WASTED_VRAM:
   case SI_QUERY_SLAB_WASTED_GTT:
   case SI_QUERY_VRAM_USAGE:
   case SI_QUERY_VRAM_VIS_USAGE:
   case SI_QUERY_GTT_USAGE:
   case SI_QUERY_GPU_TEMPERATURE:
   case SI_QUERY_CURRENT_GPU_SCLK:
   case SI_QUERY_CURRENT_GPU_MCLK:
   case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
   case SI_QUERY_NUM_MAPPED_BUFFERS:
      query->begin_result = 0;
      break;
   case SI_QUERY_BUFFER_WAIT_TIME:
   case SI_QUERY_GFX_IB_SIZE:
   case SI_QUERY_NUM_GFX_IBS:
   case SI_QUERY_NUM_BYTES_MOVED:
   case SI_QUERY_NUM_EVICTIONS:
   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
      enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
      break;
   }
   case SI_QUERY_GFX_BO_LIST_SIZE:
      ws_id = winsys_id_from_type(query->b.type);
      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
      query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
      break;
   case SI_QUERY_CS_THREAD_BUSY:
      ws_id = winsys_id_from_type(query->b.type);
      query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
      query->begin_time = os_time_get_nano();
      break;
   case SI_QUERY_GALLIUM_THREAD_BUSY:
      query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
      query->begin_time = os_time_get_nano();
      break;
   case SI_QUERY_GPU_LOAD:
   case SI_QUERY_GPU_SHADERS_BUSY:
   case SI_QUERY_GPU_TA_BUSY:
   case SI_QUERY_GPU_GDS_BUSY:
   case SI_QUERY_GPU_VGT_BUSY:
   case SI_QUERY_GPU_IA_BUSY:
   case SI_QUERY_GPU_SX_BUSY:
   case SI_QUERY_GPU_WD_BUSY:
   case SI_QUERY_GPU_BCI_BUSY:
   case SI_QUERY_GPU_SC_BUSY:
   case SI_QUERY_GPU_PA_BUSY:
   case SI_QUERY_GPU_DB_BUSY:
   case SI_QUERY_GPU_CP_BUSY:
   case SI_QUERY_GPU_CB_BUSY:
   case SI_QUERY_GPU_SDMA_BUSY:
   case SI_QUERY_GPU_PFP_BUSY:
   case SI_QUERY_GPU_MEQ_BUSY:
   case SI_QUERY_GPU_ME_BUSY:
   case SI_QUERY_GPU_SURF_SYNC_BUSY:
   case SI_QUERY_GPU_CP_DMA_BUSY:
   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
      query->begin_result = si_begin_counter(sctx->screen, query->b.type);
      break;
   case SI_QUERY_NUM_COMPILATIONS:
      query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
      break;
   case SI_QUERY_NUM_SHADERS_CREATED:
      query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
      break;
   case SI_QUERY_LIVE_SHADER_CACHE_HITS:
      query->begin_result = sctx->screen->live_shader_cache.hits;
      break;
   case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
      query->begin_result = sctx->screen->live_shader_cache.misses;
      break;
   case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
      query->begin_result = sctx->screen->num_memory_shader_cache_hits;
      break;
   case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
      query->begin_result = sctx->screen->num_memory_shader_cache_misses;
      break;
   case SI_QUERY_DISK_SHADER_CACHE_HITS:
      query->begin_result = sctx->screen->num_disk_shader_cache_hits;
      break;
   case SI_QUERY_DISK_SHADER_CACHE_MISSES:
      query->begin_result = sctx->screen->num_disk_shader_cache_misses;
      break;
   case SI_QUERY_GPIN_ASIC_ID:
   case SI_QUERY_GPIN_NUM_SIMD:
   case SI_QUERY_GPIN_NUM_RB:
   case SI_QUERY_GPIN_NUM_SPI:
   case SI_QUERY_GPIN_NUM_SE:
      break;
   default:
      unreachable("si_query_sw_begin: bad query type");
   }

   return true;
}

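/* Take the end snapshot. GPU_FINISHED additionally starts a deferred flush so
 * that the returned fence can be waited on in get_result. */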
static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_sw *query = (struct si_query_sw *)squery;
   enum radeon_value_id ws_id;

   switch (query->b.type) {
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      break;
   case PIPE_QUERY_GPU_FINISHED:
      sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
      break;
   case SI_QUERY_DRAW_CALLS:
      query->end_result = sctx->num_draw_calls;
      break;
   case SI_QUERY_DECOMPRESS_CALLS:
      query->end_result = sctx->num_decompress_calls;
      break;
   case SI_QUERY_PRIM_RESTART_CALLS:
      query->end_result = sctx->num_prim_restart_calls;
      break;
   case SI_QUERY_COMPUTE_CALLS:
      query->end_result = sctx->num_compute_calls;
      break;
   case SI_QUERY_CP_DMA_CALLS:
      query->end_result = sctx->num_cp_dma_calls;
      break;
   case SI_QUERY_NUM_VS_FLUSHES:
      query->end_result = sctx->num_vs_flushes;
      break;
   case SI_QUERY_NUM_PS_FLUSHES:
      query->end_result = sctx->num_ps_flushes;
      break;
   case SI_QUERY_NUM_CS_FLUSHES:
      query->end_result = sctx->num_cs_flushes;
      break;
   case SI_QUERY_NUM_CB_CACHE_FLUSHES:
      query->end_result = sctx->num_cb_cache_flushes;
      break;
   case SI_QUERY_NUM_DB_CACHE_FLUSHES:
      query->end_result = sctx->num_db_cache_flushes;
      break;
   case SI_QUERY_NUM_L2_INVALIDATES:
      query->end_result = sctx->num_L2_invalidates;
      break;
   case SI_QUERY_NUM_L2_WRITEBACKS:
      query->end_result = sctx->num_L2_writebacks;
      break;
   case SI_QUERY_NUM_RESIDENT_HANDLES:
      query->end_result = sctx->num_resident_handles;
      break;
   case SI_QUERY_TC_OFFLOADED_SLOTS:
      query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
      break;
   case SI_QUERY_TC_DIRECT_SLOTS:
      query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
      break;
   case SI_QUERY_TC_NUM_SYNCS:
      query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
      break;
   case SI_QUERY_REQUESTED_VRAM:
   case SI_QUERY_REQUESTED_GTT:
   case SI_QUERY_MAPPED_VRAM:
   case SI_QUERY_MAPPED_GTT:
   case SI_QUERY_SLAB_WASTED_VRAM:
   case SI_QUERY_SLAB_WASTED_GTT:
   case SI_QUERY_VRAM_USAGE:
   case SI_QUERY_VRAM_VIS_USAGE:
   case SI_QUERY_GTT_USAGE:
   case SI_QUERY_GPU_TEMPERATURE:
   case SI_QUERY_CURRENT_GPU_SCLK:
   case SI_QUERY_CURRENT_GPU_MCLK:
   case SI_QUERY_BUFFER_WAIT_TIME:
   case SI_QUERY_GFX_IB_SIZE:
   case SI_QUERY_NUM_MAPPED_BUFFERS:
   case SI_QUERY_NUM_GFX_IBS:
   case SI_QUERY_NUM_BYTES_MOVED:
   case SI_QUERY_NUM_EVICTIONS:
   case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
      enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
      break;
   }
   case SI_QUERY_GFX_BO_LIST_SIZE:
      ws_id = winsys_id_from_type(query->b.type);
      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
      query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
      break;
   case SI_QUERY_CS_THREAD_BUSY:
      ws_id = winsys_id_from_type(query->b.type);
      query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
      query->end_time = os_time_get_nano();
      break;
   case SI_QUERY_GALLIUM_THREAD_BUSY:
      query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
      query->end_time = os_time_get_nano();
      break;
   case SI_QUERY_GPU_LOAD:
   case SI_QUERY_GPU_SHADERS_BUSY:
   case SI_QUERY_GPU_TA_BUSY:
   case SI_QUERY_GPU_GDS_BUSY:
   case SI_QUERY_GPU_VGT_BUSY:
   case SI_QUERY_GPU_IA_BUSY:
   case SI_QUERY_GPU_SX_BUSY:
   case SI_QUERY_GPU_WD_BUSY:
   case SI_QUERY_GPU_BCI_BUSY:
   case SI_QUERY_GPU_SC_BUSY:
   case SI_QUERY_GPU_PA_BUSY:
   case SI_QUERY_GPU_DB_BUSY:
   case SI_QUERY_GPU_CP_BUSY:
   case SI_QUERY_GPU_CB_BUSY:
   case SI_QUERY_GPU_SDMA_BUSY:
   case SI_QUERY_GPU_PFP_BUSY:
   case SI_QUERY_GPU_MEQ_BUSY:
   case SI_QUERY_GPU_ME_BUSY:
   case SI_QUERY_GPU_SURF_SYNC_BUSY:
   case SI_QUERY_GPU_CP_DMA_BUSY:
   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
      query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
      query->begin_result = 0;
      break;
   case SI_QUERY_NUM_COMPILATIONS:
      query->end_result = p_atomic_read(&sctx->screen->num_compilations);
      break;
   case SI_QUERY_NUM_SHADERS_CREATED:
      query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
      break;
   case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
      query->end_result = sctx->last_tex_ps_draw_ratio;
      break;
   case SI_QUERY_LIVE_SHADER_CACHE_HITS:
      query->end_result = sctx->screen->live_shader_cache.hits;
      break;
   case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
      query->end_result = sctx->screen->live_shader_cache.misses;
      break;
   case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
      query->end_result = sctx->screen->num_memory_shader_cache_hits;
      break;
   case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
      query->end_result = sctx->screen->num_memory_shader_cache_misses;
      break;
   case SI_QUERY_DISK_SHADER_CACHE_HITS:
      query->end_result = sctx->screen->num_disk_shader_cache_hits;
      break;
   case SI_QUERY_DISK_SHADER_CACHE_MISSES:
      query->end_result = sctx->screen->num_disk_shader_cache_misses;
      break;
   case SI_QUERY_GPIN_ASIC_ID:
   case SI_QUERY_GPIN_NUM_SIMD:
   case SI_QUERY_GPIN_NUM_RB:
   case SI_QUERY_GPIN_NUM_SPI:
   case SI_QUERY_GPIN_NUM_SE:
      break;
   default:
      unreachable("si_query_sw_end: bad query type");
   }

   return true;
}

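/* Combine the begin/end snapshots into the query result and convert units
 * where needed (clock values are scaled by 1000000, wait time and temperature
 * are divided by 1000). */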
static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                                   union pipe_query_result *result)
{
   struct si_query_sw *query = (struct si_query_sw *)squery;

   switch (query->b.type) {
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* Convert from cycles per millisecond to cycles per second (Hz). */
      result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
      result->timestamp_disjoint.disjoint = false;
      return true;
   case PIPE_QUERY_GPU_FINISHED: {
      struct pipe_screen *screen = sctx->b.screen;
      struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;

      result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
      return result->b;
   }

   case SI_QUERY_GFX_BO_LIST_SIZE:
      result->u64 =
         (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
      return true;
   case SI_QUERY_CS_THREAD_BUSY:
   case SI_QUERY_GALLIUM_THREAD_BUSY:
      result->u64 =
         (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
      return true;
   case SI_QUERY_GPIN_ASIC_ID:
      result->u32 = 0;
      return true;
   case SI_QUERY_GPIN_NUM_SIMD:
      result->u32 = sctx->screen->info.num_good_compute_units;
      return true;
   case SI_QUERY_GPIN_NUM_RB:
      result->u32 = sctx->screen->info.max_render_backends;
      return true;
   case SI_QUERY_GPIN_NUM_SPI:
      result->u32 = 1; /* all supported chips have one SPI per SE */
      return true;
   case SI_QUERY_GPIN_NUM_SE:
      result->u32 = sctx->screen->info.max_se;
      return true;
   }

   result->u64 = query->end_result - query->begin_result;

   switch (query->b.type) {
   case SI_QUERY_BUFFER_WAIT_TIME:
   case SI_QUERY_GPU_TEMPERATURE:
      result->u64 /= 1000;
      break;
   case SI_QUERY_CURRENT_GPU_SCLK:
   case SI_QUERY_CURRENT_GPU_MCLK:
      result->u64 *= 1000000;
      break;
   }

   return true;
}

static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
                                                 .begin = si_query_sw_begin,
                                                 .end = si_query_sw_end,
                                                 .get_result = si_query_sw_get_result,
                                                 .get_result_resource = NULL};

static struct pipe_query *si_query_sw_create(unsigned query_type)
{
   struct si_query_sw *query;

   query = CALLOC_STRUCT(si_query_sw);
   if (!query)
      return NULL;

   query->b.type = query_type;
   query->b.ops = &sw_query_ops;

   return (struct pipe_query *)query;
}

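/* Hardware query results live in a chain of buffers: "buffer" is the most
 * recently allocated one and ->previous links to older, already filled ones. */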
void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
{
   struct si_query_buffer *prev = buffer->previous;

   /* Release all query buffers. */
   while (prev) {
      struct si_query_buffer *qbuf = prev;
      prev = prev->previous;
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }

   si_resource_reference(&buffer->buf, NULL);
}

void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
{
   /* Discard all query buffers except for the oldest. */
   while (buffer->previous) {
      struct si_query_buffer *qbuf = buffer->previous;
      buffer->previous = qbuf->previous;

      si_resource_reference(&buffer->buf, NULL);
      buffer->buf = qbuf->buf; /* move ownership */
      FREE(qbuf);
   }
   buffer->results_end = 0;

   if (!buffer->buf)
      return;

   /* Discard even the oldest buffer if it can't be mapped without a stall. */
   if (si_cs_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
       !sctx->ws->buffer_wait(sctx->ws, buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
      si_resource_reference(&buffer->buf, NULL);
   } else {
      buffer->unprepared = true;
   }
}

bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
                           bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
                           unsigned size)
{
   bool unprepared = buffer->unprepared;
   buffer->unprepared = false;

   if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
      if (buffer->buf) {
         struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
         memcpy(qbuf, buffer, sizeof(*qbuf));
         buffer->previous = qbuf;
      }
      buffer->results_end = 0;

      /* Queries are normally read by the CPU after
       * being written by the gpu, hence staging is probably a good
       * usage pattern.
       */
      struct si_screen *screen = sctx->screen;
      unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
      buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!buffer->buf))
         return false;
      unprepared = true;
   }

   if (unprepared && prepare_buffer) {
      if (unlikely(!prepare_buffer(sctx, buffer))) {
         si_resource_reference(&buffer->buf, NULL);
         return false;
      }
   }

   return true;
}

void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_hw *query = (struct si_query_hw *)squery;

   si_query_buffer_destroy(sctx->screen, &query->buffer);
   si_resource_reference(&query->workaround_buf, NULL);
   FREE(squery);
}

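/* Clear a freshly allocated query buffer. For occlusion queries, the slots of
 * disabled render backends are pre-marked with the "result written" bit so
 * they are treated as valid zero results. */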
static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
{
   struct si_query_hw *query = container_of(qbuf, struct si_query_hw, buffer);
   struct si_screen *screen = sctx->screen;

   /* The caller ensures that the buffer is currently unused by the GPU. */
   uint32_t *results = screen->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
                                              PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   if (!results)
      return false;

   memset(results, 0, qbuf->buf->b.b.width0);

   if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
      unsigned max_rbs = screen->info.max_render_backends;
      unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
      unsigned num_results;
      unsigned i, j;

      /* Set top bits for unused backends. */
      num_results = qbuf->buf->b.b.width0 / query->result_size;
      for (j = 0; j < num_results; j++) {
         for (i = 0; i < max_rbs; i++) {
            if (!(enabled_rb_mask & (1 << i))) {
               results[(i * 4) + 1] = 0x80000000;
               results[(i * 4) + 3] = 0x80000000;
            }
         }
         results += 4 * max_rbs;
      }
   }

   return true;
}

static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
                                            bool wait, enum pipe_query_value_type result_type,
                                            int index, struct pipe_resource *resource,
                                            unsigned offset);

static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
                                      struct si_resource *buffer, uint64_t va);
static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
                                     struct si_resource *buffer, uint64_t va);
static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer,
                                   union pipe_query_result *result);
static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *);

static struct si_query_hw_ops query_hw_default_hw_ops = {
   .prepare_buffer = si_query_hw_prepare_buffer,
   .emit_start = si_query_hw_do_emit_start,
   .emit_stop = si_query_hw_do_emit_stop,
   .clear_result = si_query_hw_clear_result,
   .add_result = si_query_hw_add_result,
};

static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
                                             unsigned index)
{
   struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
   if (!query)
      return NULL;

   query->b.type = query_type;
   query->b.ops = &query_hw_ops;
   query->ops = &query_hw_default_hw_ops;

   switch (query_type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      query->result_size = 16 * sscreen->info.max_render_backends;
      query->result_size += 16; /* for the fence + alignment */
      query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      query->result_size = 24;
      query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
      break;
   case PIPE_QUERY_TIMESTAMP:
      query->result_size = 16;
      query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
      query->flags = SI_QUERY_HW_FLAG_NO_START;
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
      query->result_size = 32;
      query->b.num_cs_dw_suspend = 6;
      query->stream = index;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
      query->result_size = 32 * SI_MAX_STREAMS;
      query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      /* 11 values on GCN. */
      query->result_size = 11 * 16;
      query->result_size += 8; /* for the fence + alignment */
      query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
      break;
   default:
      assert(0);
      FREE(query);
      return NULL;
   }

   return (struct pipe_query *)query;
}

static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
{
   if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
      bool old_enable = sctx->num_occlusion_queries != 0;
      bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
      bool enable, perfect_enable;

      sctx->num_occlusion_queries += diff;
      assert(sctx->num_occlusion_queries >= 0);

      if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
         sctx->num_perfect_occlusion_queries += diff;
         assert(sctx->num_perfect_occlusion_queries >= 0);
      }

      enable = sctx->num_occlusion_queries != 0;
      perfect_enable = sctx->num_perfect_occlusion_queries != 0;

      if (enable != old_enable || perfect_enable != old_perfect_enable) {
         si_set_occlusion_query_state(sctx, old_perfect_enable);
      }
   }
}

static unsigned event_type_for_stream(unsigned stream)
{
   switch (stream) {
   default:
   case 0:
      return V_028A90_SAMPLE_STREAMOUTSTATS;
   case 1:
      return V_028A90_SAMPLE_STREAMOUTSTATS1;
   case 2:
      return V_028A90_SAMPLE_STREAMOUTSTATS2;
   case 3:
      return V_028A90_SAMPLE_STREAMOUTSTATS3;
   }
}

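/* Emit the EVENT_WRITE packet that samples streamout statistics for one
 * stream into the query buffer at "va". */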
static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
{
   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
   radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
   radeon_emit(va);
   radeon_emit(va >> 32);
   radeon_end();
}

static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
                                      struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   switch (query->b.type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
      radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
      radeon_emit(va);
      radeon_emit(va >> 32);
      radeon_end();
      break;
   }
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      emit_sample_streamout(cs, va, query->stream);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
         emit_sample_streamout(cs, va + 32 * stream, stream);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
      radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
      radeon_emit(va);
      radeon_emit(va >> 32);
      radeon_end();
      break;
   }
   default:
      assert(0);
   }
   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
                             RADEON_PRIO_QUERY);
}

static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
{
   uint64_t va;

   if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
      return;

   si_update_occlusion_query_state(sctx, query->b.type, 1);
   si_update_prims_generated_query_state(sctx, query->b.type, 1);

   if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
      sctx->num_pipeline_stat_queries++;

   si_need_gfx_cs_space(sctx, 0);

   va = query->buffer.buf->gpu_address + query->buffer.results_end;
   query->ops->emit_start(sctx, query, query->buffer.buf, va);
}

static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
                                     struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   uint64_t fence_va = 0;

   switch (query->b.type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
      va += 8;
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
      radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
      radeon_emit(va);
      radeon_emit(va >> 32);
      radeon_end();

      fence_va = va + sctx->screen->info.max_render_backends * 16 - 8;
      break;
   }
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      va += 16;
      emit_sample_streamout(cs, va, query->stream);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      va += 16;
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
         emit_sample_streamout(cs, va + 32 * stream, stream);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      va += 8;
      FALLTHROUGH;
   case PIPE_QUERY_TIMESTAMP:
      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
      fence_va = va + 8;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      unsigned sample_size = (query->result_size - 8) / 2;

      va += sample_size;
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
      radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
      radeon_emit(va);
      radeon_emit(va >> 32);
      radeon_end();

      fence_va = va + sample_size;
      break;
   }
   default:
      assert(0);
   }
   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
                             RADEON_PRIO_QUERY);

   if (fence_va) {
      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
                        query->b.type);
   }
}

static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
{
   uint64_t va;

   /* The queries which need begin already called this in begin_query. */
   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
      si_need_gfx_cs_space(sctx, 0);
      if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
                                 query->result_size))
         return;
   }

   if (!query->buffer.buf)
      return; // previous buffer allocation failure

   /* emit end query */
   va = query->buffer.buf->gpu_address + query->buffer.results_end;

   query->ops->emit_stop(sctx, query, query->buffer.buf, va);

   query->buffer.results_end += query->result_size;

   si_update_occlusion_query_state(sctx, query->b.type, -1);
   si_update_prims_generated_query_state(sctx, query->b.type, -1);

   if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
      sctx->num_pipeline_stat_queries--;
}

static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
                               uint32_t op)
{
   struct radeon_cmdbuf *cs = &ctx->gfx_cs;

   radeon_begin(cs);

   if (ctx->chip_class >= GFX9) {
      radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
      radeon_emit(op);
      radeon_emit(va);
      radeon_emit(va >> 32);
   } else {
      radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
      radeon_emit(va);
      radeon_emit(op | ((va >> 32) & 0xFF));
   }
   radeon_end();

   radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY);
}

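/* Emit SET_PREDICATION packets for the current render condition, covering
 * every result pair in every buffer of the bound query. */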
static void si_emit_query_predication(struct si_context *ctx)
{
   uint32_t op;
   bool flag_wait, invert;

   struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
   if (!query)
      return;

   invert = ctx->render_cond_invert;
   flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
               ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;

   if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
                                          query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
      struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query;
      struct gfx10_sh_query_buffer *qbuf, *first, *last;

      op = PRED_OP(PREDICATION_OP_PRIMCOUNT);

      /* if true then invert, see GL_ARB_conditional_render_inverted */
      if (!invert)
         op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
      else
         op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */

      op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;

      first = gfx10_query->first;
      last = gfx10_query->last;

      while (first) {
         qbuf = first;
         if (first != last)
            first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
         else
            first = NULL;

         unsigned results_base = gfx10_query->first_begin;
         uint64_t va_base = qbuf->buf->gpu_address;
         uint64_t va = va_base + results_base;

         unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
         unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;

         unsigned count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
         do {
            if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
                  emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);

                  /* set CONTINUE bit for all packets except the first */
                  op |= PREDICATION_CONTINUE;
               }
            } else {
               emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
               op |= PREDICATION_CONTINUE;
            }

            results_base += sizeof(struct gfx10_sh_query_buffer_mem);
         } while (count--);
      }
   } else {
      struct si_query_buffer *qbuf;

      if (query->workaround_buf) {
         op = PRED_OP(PREDICATION_OP_BOOL64);
      } else {
         switch (query->b.type) {
         case PIPE_QUERY_OCCLUSION_COUNTER:
         case PIPE_QUERY_OCCLUSION_PREDICATE:
         case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
            op = PRED_OP(PREDICATION_OP_ZPASS);
            break;
         case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
            op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
            invert = !invert;
            break;
         default:
            assert(0);
            return;
         }
      }

      /* if true then invert, see GL_ARB_conditional_render_inverted */
      if (invert)
         op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
      else
         op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */

      /* Use the value written by compute shader as a workaround. Note that
       * the wait flag does not apply in this predication mode.
       *
       * The shader outputs the result value to L2. Workarounds only affect GFX8
       * and later, where the CP reads data from L2, so we don't need an
       * additional flush.
       */
      if (query->workaround_buf) {
         uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
         emit_set_predicate(ctx, query->workaround_buf, va, op);
         return;
      }

      op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;

      /* emit predicate packets for all data blocks */
      for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
         unsigned results_base = 0;
         uint64_t va_base = qbuf->buf->gpu_address;

         while (results_base < qbuf->results_end) {
            uint64_t va = va_base + results_base;

            if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
               for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
                  emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);

                  /* set CONTINUE bit for all packets except the first */
                  op |= PREDICATION_CONTINUE;
               }
            } else {
               emit_set_predicate(ctx, qbuf->buf, va, op);
               op |= PREDICATION_CONTINUE;
            }

            results_base += query->result_size;
         }
      }
   }
}

static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
                                          unsigned index)
{
   struct si_screen *sscreen = (struct si_screen *)ctx->screen;

   if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
       (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
      return si_query_sw_create(query_type);

   if (sscreen->use_ngg_streamout &&
       (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
        query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
        query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
        query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
      return gfx10_sh_query_create(sscreen, query_type, index);

   return si_query_hw_create(sscreen, query_type, index);
}

static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query *squery = (struct si_query *)query;

   squery->ops->destroy(sctx, squery);
}

static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query *squery = (struct si_query *)query;

   return squery->ops->begin(sctx, squery);
}

bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_hw *query = (struct si_query_hw *)squery;

   if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
      assert(0);
      return false;
   }

   if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
      si_query_buffer_reset(sctx, &query->buffer);

   si_resource_reference(&query->workaround_buf, NULL);

   si_query_hw_emit_start(sctx, query);
   if (!query->buffer.buf)
      return false;

   list_addtail(&query->b.active_list, &sctx->active_queries);
   sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
   return true;
}

static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query *squery = (struct si_query *)query;

   return squery->ops->end(sctx, squery);
}

bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
{
   struct si_query_hw *query = (struct si_query_hw *)squery;

   if (query->flags & SI_QUERY_HW_FLAG_NO_START)
      si_query_buffer_reset(sctx, &query->buffer);

   si_query_hw_emit_stop(sctx, query);

   if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
      list_delinit(&query->b.active_list);
      sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
   }

   if (!query->buffer.buf)
      return false;

   return true;
}

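/* Return the byte offsets of the begin/end values (and the fence) within one
 * result pair of the given query type, matching the layout written by the
 * emit functions above. */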
static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
                                   struct si_hw_query_params *params)
{
   unsigned max_rbs = sctx->screen->info.max_render_backends;

   params->pair_stride = 0;
   params->pair_count = 1;

   switch (squery->b.type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      params->start_offset = 0;
      params->end_offset = 8;
      params->fence_offset = max_rbs * 16;
      params->pair_stride = 16;
      params->pair_count = max_rbs;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      params->start_offset = 0;
      params->end_offset = 8;
      params->fence_offset = 16;
      break;
   case PIPE_QUERY_TIMESTAMP:
      params->start_offset = 0;
      params->end_offset = 0;
      params->fence_offset = 8;
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      params->start_offset = 8;
      params->end_offset = 24;
      params->fence_offset = params->end_offset + 4;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      params->start_offset = 0;
      params->end_offset = 16;
      params->fence_offset = params->end_offset + 4;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      params->start_offset = 8 - index * 8;
      params->end_offset = 24 - index * 8;
      params->fence_offset = params->end_offset + 4;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      params->pair_count = SI_MAX_STREAMS;
      params->pair_stride = 32;
      FALLTHROUGH;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      params->start_offset = 0;
      params->end_offset = 16;

      /* We can re-use the high dword of the last 64-bit value as a
       * fence: it is initialized as 0, and the high bit is set by
       * the write of the streamout stats event.
       */
      params->fence_offset = squery->result_size - 4;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
      params->start_offset = offsets[index];
      params->end_offset = 88 + offsets[index];
      params->fence_offset = 2 * 88;
      break;
   }
   default:
      unreachable("si_get_hw_query_params unsupported");
   }
}

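/* Read a begin/end pair of 64-bit counters from a mapped result buffer and
 * return the difference. If test_status_bit is set, bit 63 of both values
 * must be set (i.e. the GPU has written them), otherwise 0 is returned. */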
static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
                                     bool test_status_bit)
{
   uint32_t *current_result = (uint32_t *)map;
   uint64_t start, end;

   start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
   end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;

   if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
      return end - start;
   }
   return 0;
}

static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
                                   void *buffer, union pipe_query_result *result)
{
   unsigned max_rbs = sscreen->info.max_render_backends;

   switch (query->b.type) {
   case PIPE_QUERY_OCCLUSION_COUNTER: {
      for (unsigned i = 0; i < max_rbs; ++i) {
         unsigned results_base = i * 16;
         result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
      }
      break;
   }
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
      for (unsigned i = 0; i < max_rbs; ++i) {
         unsigned results_base = i * 16;
         result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
      }
      break;
   }
   case PIPE_QUERY_TIME_ELAPSED:
      result->u64 += si_query_read_result(buffer, 0, 2, false);
      break;
   case PIPE_QUERY_TIMESTAMP:
      result->u64 = *(uint64_t *)buffer;
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      /* SAMPLE_STREAMOUTSTATS stores this structure:
       * {
       *    u64 NumPrimitivesWritten;
       *    u64 PrimitiveStorageNeeded;
       * }
       * We only need NumPrimitivesWritten here. */
      result->u64 += si_query_read_result(buffer, 2, 6, true);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      /* Here we read PrimitiveStorageNeeded. */
      result->u64 += si_query_read_result(buffer, 0, 4, true);
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
      result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
                                  si_query_read_result(buffer, 0, 4, true);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
                                     si_query_read_result(buffer, 0, 4, true);
         buffer = (char *)buffer + 32;
      }
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false);
      result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false);
      result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false);
      result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false);
      result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false);
      result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false);
      result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false);
      result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false);
      result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false);
      result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false);
      result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false);
#if 0 /* for testing */
      printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
             "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
             "Clipper prims=%llu, PS=%llu, CS=%llu\n",
             result->pipeline_statistics.ia_vertices,
             result->pipeline_statistics.ia_primitives,
             result->pipeline_statistics.vs_invocations,
             result->pipeline_statistics.hs_invocations,
             result->pipeline_statistics.ds_invocations,
             result->pipeline_statistics.gs_invocations,
             result->pipeline_statistics.gs_primitives,
             result->pipeline_statistics.c_invocations,
             result->pipeline_statistics.c_primitives,
             result->pipeline_statistics.ps_invocations,
             result->pipeline_statistics.cs_invocations);
#endif
      break;
   default:
      assert(0);
   }
}

void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
{
   si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
}

void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
{
   si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
}

static const struct si_query_ops query_hw_ops = {
   .destroy = si_query_hw_destroy,
   .begin = si_query_hw_begin,
   .end = si_query_hw_end,
   .get_result = si_query_hw_get_result,
   .get_result_resource = si_query_hw_get_result_resource,

   .suspend = si_query_hw_suspend,
   .resume = si_query_hw_resume,
};

static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
                                union pipe_query_result *result)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query *squery = (struct si_query *)query;

   return squery->ops->get_result(sctx, squery, wait, result);
}

static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
                                         bool wait, enum pipe_query_value_type result_type,
                                         int index, struct pipe_resource *resource, unsigned offset)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query *squery = (struct si_query *)query;

   squery->ops->get_result_resource(sctx, squery, wait, result_type, index, resource, offset);
}

static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
{
   util_query_clear_result(result, query->b.type);
}

bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                            union pipe_query_result *result)
{
   struct si_screen *sscreen = sctx->screen;
   struct si_query_hw *query = (struct si_query_hw *)squery;
   struct si_query_buffer *qbuf;

   query->ops->clear_result(query, result);

   for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      unsigned results_base = 0;
      void *map;

      if (squery->b.flushed)
         map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      while (results_base != qbuf->results_end) {
         query->ops->add_result(sscreen, query, map + results_base, result);
         results_base += query->result_size;
      }
   }

   /* Convert the time to expected units. */
   if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
       squery->type == PIPE_QUERY_TIMESTAMP) {
      result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
   }
   return true;
}

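/* Write the query result into a GPU buffer using a compute shader that
 * accumulates all result pairs across the buffer chain, optionally waiting on
 * the fence of the last pair before dispatching. */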
si_query_hw_get_result_resource(struct si_context * sctx,struct si_query * squery,bool wait,enum pipe_query_value_type result_type,int index,struct pipe_resource * resource,unsigned offset)1425 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1426 bool wait, enum pipe_query_value_type result_type,
1427 int index, struct pipe_resource *resource,
1428 unsigned offset)
1429 {
1430 struct si_query_hw *query = (struct si_query_hw *)squery;
1431 struct si_query_buffer *qbuf;
1432 struct si_query_buffer *qbuf_prev;
1433 struct pipe_resource *tmp_buffer = NULL;
1434 unsigned tmp_buffer_offset = 0;
1435 struct si_qbo_state saved_state = {};
1436 struct pipe_grid_info grid = {};
1437 struct pipe_constant_buffer constant_buffer = {};
1438 struct pipe_shader_buffer ssbo[3];
1439 struct si_hw_query_params params;
1440 struct {
1441 uint32_t end_offset;
1442 uint32_t result_stride;
1443 uint32_t result_count;
1444 uint32_t config;
1445 uint32_t fence_offset;
1446 uint32_t pair_stride;
1447 uint32_t pair_count;
1448 } consts;
1449
1450 if (!sctx->query_result_shader) {
1451 sctx->query_result_shader = si_create_query_result_cs(sctx);
1452 if (!sctx->query_result_shader)
1453 return;
1454 }
1455
1456 if (query->buffer.previous) {
1457 u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1458 if (!tmp_buffer)
1459 return;
1460 }
1461
1462 si_save_qbo_state(sctx, &saved_state);
1463
1464 si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, ¶ms);
1465 consts.end_offset = params.end_offset - params.start_offset;
1466 consts.fence_offset = params.fence_offset - params.start_offset;
1467 consts.result_stride = query->result_size;
1468 consts.pair_stride = params.pair_stride;
1469 consts.pair_count = params.pair_count;
1470
1471 constant_buffer.buffer_size = sizeof(consts);
1472 constant_buffer.user_buffer = &consts;
1473
1474 ssbo[1].buffer = tmp_buffer;
1475 ssbo[1].buffer_offset = tmp_buffer_offset;
1476 ssbo[1].buffer_size = 16;
1477
1478 ssbo[2] = ssbo[1];
1479
1480 grid.block[0] = 1;
1481 grid.block[1] = 1;
1482 grid.block[2] = 1;
1483 grid.grid[0] = 1;
1484 grid.grid[1] = 1;
1485 grid.grid[2] = 1;
1486
1487 consts.config = 0;
1488 if (index < 0)
1489 consts.config |= 4;
1490 if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1491 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1492 consts.config |= 8;
1493 else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1494 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1495 consts.config |= 8 | 256;
1496 else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1497 consts.config |= 32;
1498
1499 switch (result_type) {
1500 case PIPE_QUERY_TYPE_U64:
1501 case PIPE_QUERY_TYPE_I64:
1502 consts.config |= 64;
1503 break;
1504 case PIPE_QUERY_TYPE_I32:
1505 consts.config |= 128;
1506 break;
1507 case PIPE_QUERY_TYPE_U32:
1508 break;
1509 }
1510
1511 sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
1512
1513 for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1514 if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1515 qbuf_prev = qbuf->previous;
1516 consts.result_count = qbuf->results_end / query->result_size;
1517 consts.config &= ~3;
1518 if (qbuf != &query->buffer)
1519 consts.config |= 1;
1520 if (qbuf->previous)
1521 consts.config |= 2;
1522 } else {
1523 /* Only read the last timestamp. */
1524 qbuf_prev = NULL;
1525 consts.result_count = 0;
1526 consts.config |= 16;
1527 params.start_offset += qbuf->results_end - query->result_size;
1528 }
1529
1530 sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
1531
1532 ssbo[0].buffer = &qbuf->buf->b.b;
1533 ssbo[0].buffer_offset = params.start_offset;
1534 ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1535
1536 if (!qbuf->previous) {
1537 ssbo[2].buffer = resource;
1538 ssbo[2].buffer_offset = offset;
1539 ssbo[2].buffer_size = 8;
1540
1541 si_resource(resource)->TC_L2_dirty = true;
1542 }
1543
1544 if (wait && qbuf == &query->buffer) {
1545 uint64_t va;
1546
1547 /* Wait for result availability. Wait only for readiness
1548 * of the last entry, since the fence writes should be
1549 * serialized in the CP.
1550 */
1551 va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1552 va += params.fence_offset;
1553
1554 si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
1555 }
1556 si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader,
1557 SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
1558 3, ssbo, 0x4);
1559 }
1560
1561 si_restore_qbo_state(sctx, &saved_state);
1562 pipe_resource_reference(&tmp_buffer, NULL);
1563 }
1564
1565 static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
1566 enum pipe_render_cond_flag mode)
1567 {
1568 struct si_context *sctx = (struct si_context *)ctx;
1569 struct si_query_hw *squery = (struct si_query_hw *)query;
1570 struct si_atom *atom = &sctx->atoms.s.render_cond;
1571
1572 if (query) {
1573 bool needs_workaround = false;
1574
1575 /* There was a firmware regression in GFX8 which causes successive
1576 * SET_PREDICATION packets to give the wrong answer for
1577 * non-inverted stream overflow predication.
1578 */
1579 if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
1580 (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
1581 !condition &&
1582 (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1583 (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1584 (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
1585 needs_workaround = true;
1586 }
1587
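/* Resolve the predicate once into a freshly allocated, zero-initialized
 * 8-byte buffer using the compute-based result path; predication then
 * references this single pre-computed value instead of the raw query
 * results. */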
1588 if (needs_workaround && !squery->workaround_buf) {
1589 bool old_render_cond_enabled = sctx->render_cond_enabled;
1590 sctx->render_cond_enabled = false;
1591
1592 u_suballocator_alloc(&sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
1593 (struct pipe_resource **)&squery->workaround_buf);
1594
1595 /* Reset to NULL so that launching the compute grid below doesn't
1596 * emit a redundant SET_PREDICATION packet.
1597 */
1598 sctx->render_cond = NULL;
1599
1600 ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
1601 &squery->workaround_buf->b.b, squery->workaround_offset);
1602
1603 /* Setting this in the render cond atom is too late,
1604 * so set it here. */
1605 sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
1606
1607 sctx->render_cond_enabled = old_render_cond_enabled;
1608 }
1609 }
1610
1611 sctx->render_cond = query;
1612 sctx->render_cond_invert = condition;
1613 sctx->render_cond_mode = mode;
1614 sctx->render_cond_enabled = query;
1615
1616 si_set_atom_dirty(sctx, atom, query != NULL);
1617 }
1618
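/* Suspend and resume all currently active queries. These are typically
 * called around gfx IB flushes so that query state is stopped in the old IB
 * and re-emitted in the new one. */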
1619 void si_suspend_queries(struct si_context *sctx)
1620 {
1621 struct si_query *query;
1622
1623 LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1624 query->ops->suspend(sctx, query);
1625 }
1626
1627 void si_resume_queries(struct si_context *sctx)
1628 {
1629 struct si_query *query;
1630
1631 /* Check CS space here. Resuming must not be interrupted by flushes. */
1632 si_need_gfx_cs_space(sctx, 0);
1633
1634 LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1635 query->ops->resume(sctx, query);
1636 }
1637
1638 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
1639 { \
1640 .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1641 .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_ \
1642 }
1643
1644 #define X(name_, query_type_, type_, result_type_) \
1645 XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1646
1647 #define XG(group_, name_, query_type_, type_, result_type_) \
1648 XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1649
1650 static struct pipe_driver_query_info si_driver_query_list[] = {
1651 X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1652 X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1653 X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1654 X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1655 X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
1656 X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1657 X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1658 X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1659 X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1660 X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1661 X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1662 X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1663 X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1664 X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1665 X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1666 X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1667 X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1668 X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1669 X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1670 X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1671 X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1672 X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1673 X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1674 X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1675 X("slab-wasted-VRAM", SLAB_WASTED_VRAM, BYTES, AVERAGE),
1676 X("slab-wasted-GTT", SLAB_WASTED_GTT, BYTES, AVERAGE),
1677 X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1678 X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1679 X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1680 X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1681 X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1682 X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1683 X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1684 X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1685 X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1686 X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1687 X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1688 X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1689 X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1690 X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1691 X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1692 X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1693 X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1694 X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1695
1696 /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1697 * which use them as a fallback path to detect the GPU type.
1698 *
1699 * Note: The names of these queries are significant for GPUPerfStudio
1700 * (and possibly their order as well). */
1701 XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1702 XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1703 XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1704 XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1705 XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1706
1707 X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1708 X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1709 X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1710
1711 /* The following queries must be at the end of the list because their
1712 * availability is adjusted dynamically based on the DRM version. */
1713 X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1714 X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1715 X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1716 X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1717 X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1718 X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1719 X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1720 X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1721 X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1722 X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1723 X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1724 X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1725 X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1726 X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1727
1728 /* SRBM_STATUS2 */
1729 X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1730
1731 /* CP_STAT */
1732 X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1733 X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1734 X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1735 X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1736 X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1737 X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1738 };
1739
1740 #undef X
1741 #undef XG
1742 #undef XFULL
1743
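/* Trim si_driver_query_list to what the kernel can actually provide. The
 * trailing entries are the dynamically available busy counters: the last 6
 * read CP_STAT, the 7th-from-last reads SRBM_STATUS2, and the last 21
 * (everything from "GPU-load" on) form the whole register-based group. */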
1744 static unsigned si_get_num_queries(struct si_screen *sscreen)
1745 {
1746 /* amdgpu */
1747 if (sscreen->info.is_amdgpu) {
1748 if (sscreen->info.chip_class >= GFX8)
1749 return ARRAY_SIZE(si_driver_query_list);
1750 else
1751 return ARRAY_SIZE(si_driver_query_list) - 7;
1752 }
1753
1754 /* radeon */
1755 if (sscreen->info.has_read_registers_query) {
1756 if (sscreen->info.chip_class == GFX7)
1757 return ARRAY_SIZE(si_driver_query_list) - 6;
1758 else
1759 return ARRAY_SIZE(si_driver_query_list) - 7;
1760 }
1761
1762 return ARRAY_SIZE(si_driver_query_list) - 21;
1763 }
1764
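/* Driver queries are enumerated first, followed by the perfcounter queries;
 * calling this with info == NULL just returns the total count. */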
1765 static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1766 struct pipe_driver_query_info *info)
1767 {
1768 struct si_screen *sscreen = (struct si_screen *)screen;
1769 unsigned num_queries = si_get_num_queries(sscreen);
1770
1771 if (!info) {
1772 unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1773
1774 return num_queries + num_perfcounters;
1775 }
1776
1777 if (index >= num_queries)
1778 return si_get_perfcounter_info(sscreen, index - num_queries, info);
1779
1780 *info = si_driver_query_list[index];
1781
1782 switch (info->query_type) {
1783 case SI_QUERY_REQUESTED_VRAM:
1784 case SI_QUERY_VRAM_USAGE:
1785 case SI_QUERY_MAPPED_VRAM:
1786 case SI_QUERY_SLAB_WASTED_VRAM:
1787 info->max_value.u64 = sscreen->info.vram_size;
1788 break;
1789 case SI_QUERY_REQUESTED_GTT:
1790 case SI_QUERY_GTT_USAGE:
1791 case SI_QUERY_MAPPED_GTT:
1792 case SI_QUERY_SLAB_WASTED_GTT:
1793 info->max_value.u64 = sscreen->info.gart_size;
1794 break;
1795 case SI_QUERY_GPU_TEMPERATURE:
1796 info->max_value.u64 = 125;
1797 break;
1798 case SI_QUERY_VRAM_VIS_USAGE:
1799 info->max_value.u64 = sscreen->info.vram_vis_size;
1800 break;
1801 }
1802
1803 if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1804 info->group_id += sscreen->perfcounters->base.num_groups;
1805
1806 return 1;
1807 }
1808
1809 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1810 * performance counter groups, so be careful when changing this and related
1811 * functions.
1812 */
1813 static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1814 struct pipe_driver_query_group_info *info)
1815 {
1816 struct si_screen *sscreen = (struct si_screen *)screen;
1817 unsigned num_pc_groups = 0;
1818
1819 if (sscreen->perfcounters)
1820 num_pc_groups = sscreen->perfcounters->base.num_groups;
1821
1822 if (!info)
1823 return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1824
1825 if (index < num_pc_groups)
1826 return si_get_perfcounter_group_info(sscreen, index, info);
1827
1828 index -= num_pc_groups;
1829 if (index >= SI_NUM_SW_QUERY_GROUPS)
1830 return 0;
1831
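/* Any remaining index refers to a software query group; GPIN is currently
 * the only one. */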
1832 info->name = "GPIN";
1833 info->max_active_queries = 5;
1834 info->num_queries = 5;
1835 return 1;
1836 }
1837
1838 void si_init_query_functions(struct si_context *sctx)
1839 {
1840 sctx->b.create_query = si_create_query;
1841 sctx->b.create_batch_query = si_create_batch_query;
1842 sctx->b.destroy_query = si_destroy_query;
1843 sctx->b.begin_query = si_begin_query;
1844 sctx->b.end_query = si_end_query;
1845 sctx->b.get_query_result = si_get_query_result;
1846 sctx->b.get_query_result_resource = si_get_query_result_resource;
1847
1848 if (sctx->has_graphics) {
1849 sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1850 sctx->b.render_condition = si_render_condition;
1851 }
1852
1853 list_inithead(&sctx->active_queries);
1854 }
1855
1856 void si_init_screen_query_functions(struct si_screen *sscreen)
1857 {
1858 sscreen->b.get_driver_query_info = si_get_driver_query_info;
1859 sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1860 }
1861