1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4 * Copyright 2018 Advanced Micro Devices, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * on the rights to use, copy, modify, merge, publish, distribute, sub
11 * license, and/or sell copies of the Software, and to permit persons to whom
12 * the Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #include "si_query.h"
28
29 #include "amd/common/sid.h"
30 #include "si_pipe.h"
31 #include "util/os_time.h"
32 #include "util/u_memory.h"
33 #include "util/u_suballoc.h"
34 #include "util/u_upload_mgr.h"
35
36 static const struct si_query_ops query_hw_ops;
37
38 struct si_hw_query_params {
39 unsigned start_offset;
40 unsigned end_offset;
41 unsigned fence_offset;
42 unsigned pair_stride;
43 unsigned pair_count;
44 };
45
46 /* Queries without buffer handling or suspend/resume. */
47 struct si_query_sw {
48 struct si_query b;
49
50 uint64_t begin_result;
51 uint64_t end_result;
52
53 uint64_t begin_time;
54 uint64_t end_time;
55
56 /* Fence for GPU_FINISHED. */
57 struct pipe_fence_handle *fence;
58 };
59
60 static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
61 {
62 struct si_query_sw *query = (struct si_query_sw *)squery;
63
64 sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
65 FREE(query);
66 }
67
68 static enum radeon_value_id winsys_id_from_type(unsigned type)
69 {
70 switch (type) {
71 case SI_QUERY_REQUESTED_VRAM:
72 return RADEON_REQUESTED_VRAM_MEMORY;
73 case SI_QUERY_REQUESTED_GTT:
74 return RADEON_REQUESTED_GTT_MEMORY;
75 case SI_QUERY_MAPPED_VRAM:
76 return RADEON_MAPPED_VRAM;
77 case SI_QUERY_MAPPED_GTT:
78 return RADEON_MAPPED_GTT;
79 case SI_QUERY_BUFFER_WAIT_TIME:
80 return RADEON_BUFFER_WAIT_TIME_NS;
81 case SI_QUERY_NUM_MAPPED_BUFFERS:
82 return RADEON_NUM_MAPPED_BUFFERS;
83 case SI_QUERY_NUM_GFX_IBS:
84 return RADEON_NUM_GFX_IBS;
85 case SI_QUERY_NUM_SDMA_IBS:
86 return RADEON_NUM_SDMA_IBS;
87 case SI_QUERY_GFX_BO_LIST_SIZE:
88 return RADEON_GFX_BO_LIST_COUNTER;
89 case SI_QUERY_GFX_IB_SIZE:
90 return RADEON_GFX_IB_SIZE_COUNTER;
91 case SI_QUERY_NUM_BYTES_MOVED:
92 return RADEON_NUM_BYTES_MOVED;
93 case SI_QUERY_NUM_EVICTIONS:
94 return RADEON_NUM_EVICTIONS;
95 case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
96 return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
97 case SI_QUERY_VRAM_USAGE:
98 return RADEON_VRAM_USAGE;
99 case SI_QUERY_VRAM_VIS_USAGE:
100 return RADEON_VRAM_VIS_USAGE;
101 case SI_QUERY_GTT_USAGE:
102 return RADEON_GTT_USAGE;
103 case SI_QUERY_GPU_TEMPERATURE:
104 return RADEON_GPU_TEMPERATURE;
105 case SI_QUERY_CURRENT_GPU_SCLK:
106 return RADEON_CURRENT_SCLK;
107 case SI_QUERY_CURRENT_GPU_MCLK:
108 return RADEON_CURRENT_MCLK;
109 case SI_QUERY_CS_THREAD_BUSY:
110 return RADEON_CS_THREAD_TIME;
111 default:
112 unreachable("query type does not correspond to winsys id");
113 }
114 }
115
116 static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx)
117 {
118 struct pipe_fence_handle *fence = NULL;
119
120 si_flush_dma_cs(sctx, 0, &fence);
121 if (fence) {
122 sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE);
123 sctx->ws->fence_reference(&fence, NULL);
124 }
125
126 return os_time_get_nano();
127 }
128
129 static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
130 {
131 struct si_query_sw *query = (struct si_query_sw *)squery;
132 enum radeon_value_id ws_id;
133
134 switch (query->b.type) {
135 case PIPE_QUERY_TIMESTAMP_DISJOINT:
136 case PIPE_QUERY_GPU_FINISHED:
137 break;
138 case SI_QUERY_TIME_ELAPSED_SDMA_SI:
139 query->begin_result = si_finish_dma_get_cpu_time(sctx);
140 break;
141 case SI_QUERY_DRAW_CALLS:
142 query->begin_result = sctx->num_draw_calls;
143 break;
144 case SI_QUERY_DECOMPRESS_CALLS:
145 query->begin_result = sctx->num_decompress_calls;
146 break;
147 case SI_QUERY_MRT_DRAW_CALLS:
148 query->begin_result = sctx->num_mrt_draw_calls;
149 break;
150 case SI_QUERY_PRIM_RESTART_CALLS:
151 query->begin_result = sctx->num_prim_restart_calls;
152 break;
153 case SI_QUERY_SPILL_DRAW_CALLS:
154 query->begin_result = sctx->num_spill_draw_calls;
155 break;
156 case SI_QUERY_COMPUTE_CALLS:
157 query->begin_result = sctx->num_compute_calls;
158 break;
159 case SI_QUERY_SPILL_COMPUTE_CALLS:
160 query->begin_result = sctx->num_spill_compute_calls;
161 break;
162 case SI_QUERY_DMA_CALLS:
163 query->begin_result = sctx->num_dma_calls;
164 break;
165 case SI_QUERY_CP_DMA_CALLS:
166 query->begin_result = sctx->num_cp_dma_calls;
167 break;
168 case SI_QUERY_NUM_VS_FLUSHES:
169 query->begin_result = sctx->num_vs_flushes;
170 break;
171 case SI_QUERY_NUM_PS_FLUSHES:
172 query->begin_result = sctx->num_ps_flushes;
173 break;
174 case SI_QUERY_NUM_CS_FLUSHES:
175 query->begin_result = sctx->num_cs_flushes;
176 break;
177 case SI_QUERY_NUM_CB_CACHE_FLUSHES:
178 query->begin_result = sctx->num_cb_cache_flushes;
179 break;
180 case SI_QUERY_NUM_DB_CACHE_FLUSHES:
181 query->begin_result = sctx->num_db_cache_flushes;
182 break;
183 case SI_QUERY_NUM_L2_INVALIDATES:
184 query->begin_result = sctx->num_L2_invalidates;
185 break;
186 case SI_QUERY_NUM_L2_WRITEBACKS:
187 query->begin_result = sctx->num_L2_writebacks;
188 break;
189 case SI_QUERY_NUM_RESIDENT_HANDLES:
190 query->begin_result = sctx->num_resident_handles;
191 break;
192 case SI_QUERY_TC_OFFLOADED_SLOTS:
193 query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
194 break;
195 case SI_QUERY_TC_DIRECT_SLOTS:
196 query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
197 break;
198 case SI_QUERY_TC_NUM_SYNCS:
199 query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
200 break;
201 case SI_QUERY_REQUESTED_VRAM:
202 case SI_QUERY_REQUESTED_GTT:
203 case SI_QUERY_MAPPED_VRAM:
204 case SI_QUERY_MAPPED_GTT:
205 case SI_QUERY_VRAM_USAGE:
206 case SI_QUERY_VRAM_VIS_USAGE:
207 case SI_QUERY_GTT_USAGE:
208 case SI_QUERY_GPU_TEMPERATURE:
209 case SI_QUERY_CURRENT_GPU_SCLK:
210 case SI_QUERY_CURRENT_GPU_MCLK:
211 case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
212 case SI_QUERY_NUM_MAPPED_BUFFERS:
213 query->begin_result = 0;
214 break;
215 case SI_QUERY_BUFFER_WAIT_TIME:
216 case SI_QUERY_GFX_IB_SIZE:
217 case SI_QUERY_NUM_GFX_IBS:
218 case SI_QUERY_NUM_SDMA_IBS:
219 case SI_QUERY_NUM_BYTES_MOVED:
220 case SI_QUERY_NUM_EVICTIONS:
221 case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
222 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
223 query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
224 break;
225 }
226 case SI_QUERY_GFX_BO_LIST_SIZE:
227 ws_id = winsys_id_from_type(query->b.type);
228 query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
229 query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
230 break;
231 case SI_QUERY_CS_THREAD_BUSY:
232 ws_id = winsys_id_from_type(query->b.type);
233 query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
234 query->begin_time = os_time_get_nano();
235 break;
236 case SI_QUERY_GALLIUM_THREAD_BUSY:
237 query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
238 query->begin_time = os_time_get_nano();
239 break;
240 case SI_QUERY_GPU_LOAD:
241 case SI_QUERY_GPU_SHADERS_BUSY:
242 case SI_QUERY_GPU_TA_BUSY:
243 case SI_QUERY_GPU_GDS_BUSY:
244 case SI_QUERY_GPU_VGT_BUSY:
245 case SI_QUERY_GPU_IA_BUSY:
246 case SI_QUERY_GPU_SX_BUSY:
247 case SI_QUERY_GPU_WD_BUSY:
248 case SI_QUERY_GPU_BCI_BUSY:
249 case SI_QUERY_GPU_SC_BUSY:
250 case SI_QUERY_GPU_PA_BUSY:
251 case SI_QUERY_GPU_DB_BUSY:
252 case SI_QUERY_GPU_CP_BUSY:
253 case SI_QUERY_GPU_CB_BUSY:
254 case SI_QUERY_GPU_SDMA_BUSY:
255 case SI_QUERY_GPU_PFP_BUSY:
256 case SI_QUERY_GPU_MEQ_BUSY:
257 case SI_QUERY_GPU_ME_BUSY:
258 case SI_QUERY_GPU_SURF_SYNC_BUSY:
259 case SI_QUERY_GPU_CP_DMA_BUSY:
260 case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
261 query->begin_result = si_begin_counter(sctx->screen, query->b.type);
262 break;
263 case SI_QUERY_NUM_COMPILATIONS:
264 query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
265 break;
266 case SI_QUERY_NUM_SHADERS_CREATED:
267 query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
268 break;
269 case SI_QUERY_LIVE_SHADER_CACHE_HITS:
270 query->begin_result = sctx->screen->live_shader_cache.hits;
271 break;
272 case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
273 query->begin_result = sctx->screen->live_shader_cache.misses;
274 break;
275 case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
276 query->begin_result = sctx->screen->num_memory_shader_cache_hits;
277 break;
278 case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
279 query->begin_result = sctx->screen->num_memory_shader_cache_misses;
280 break;
281 case SI_QUERY_DISK_SHADER_CACHE_HITS:
282 query->begin_result = sctx->screen->num_disk_shader_cache_hits;
283 break;
284 case SI_QUERY_DISK_SHADER_CACHE_MISSES:
285 query->begin_result = sctx->screen->num_disk_shader_cache_misses;
286 break;
287 case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
288 query->begin_result = sctx->compute_num_verts_accepted;
289 break;
290 case SI_QUERY_PD_NUM_PRIMS_REJECTED:
291 query->begin_result = sctx->compute_num_verts_rejected;
292 break;
293 case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
294 query->begin_result = sctx->compute_num_verts_ineligible;
295 break;
296 case SI_QUERY_GPIN_ASIC_ID:
297 case SI_QUERY_GPIN_NUM_SIMD:
298 case SI_QUERY_GPIN_NUM_RB:
299 case SI_QUERY_GPIN_NUM_SPI:
300 case SI_QUERY_GPIN_NUM_SE:
301 break;
302 default:
303 unreachable("si_query_sw_begin: bad query type");
304 }
305
306 return true;
307 }
308
309 static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
310 {
311 struct si_query_sw *query = (struct si_query_sw *)squery;
312 enum radeon_value_id ws_id;
313
314 switch (query->b.type) {
315 case PIPE_QUERY_TIMESTAMP_DISJOINT:
316 break;
317 case PIPE_QUERY_GPU_FINISHED:
318 sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
319 break;
320 case SI_QUERY_TIME_ELAPSED_SDMA_SI:
321 query->end_result = si_finish_dma_get_cpu_time(sctx);
322 break;
323 case SI_QUERY_DRAW_CALLS:
324 query->end_result = sctx->num_draw_calls;
325 break;
326 case SI_QUERY_DECOMPRESS_CALLS:
327 query->end_result = sctx->num_decompress_calls;
328 break;
329 case SI_QUERY_MRT_DRAW_CALLS:
330 query->end_result = sctx->num_mrt_draw_calls;
331 break;
332 case SI_QUERY_PRIM_RESTART_CALLS:
333 query->end_result = sctx->num_prim_restart_calls;
334 break;
335 case SI_QUERY_SPILL_DRAW_CALLS:
336 query->end_result = sctx->num_spill_draw_calls;
337 break;
338 case SI_QUERY_COMPUTE_CALLS:
339 query->end_result = sctx->num_compute_calls;
340 break;
341 case SI_QUERY_SPILL_COMPUTE_CALLS:
342 query->end_result = sctx->num_spill_compute_calls;
343 break;
344 case SI_QUERY_DMA_CALLS:
345 query->end_result = sctx->num_dma_calls;
346 break;
347 case SI_QUERY_CP_DMA_CALLS:
348 query->end_result = sctx->num_cp_dma_calls;
349 break;
350 case SI_QUERY_NUM_VS_FLUSHES:
351 query->end_result = sctx->num_vs_flushes;
352 break;
353 case SI_QUERY_NUM_PS_FLUSHES:
354 query->end_result = sctx->num_ps_flushes;
355 break;
356 case SI_QUERY_NUM_CS_FLUSHES:
357 query->end_result = sctx->num_cs_flushes;
358 break;
359 case SI_QUERY_NUM_CB_CACHE_FLUSHES:
360 query->end_result = sctx->num_cb_cache_flushes;
361 break;
362 case SI_QUERY_NUM_DB_CACHE_FLUSHES:
363 query->end_result = sctx->num_db_cache_flushes;
364 break;
365 case SI_QUERY_NUM_L2_INVALIDATES:
366 query->end_result = sctx->num_L2_invalidates;
367 break;
368 case SI_QUERY_NUM_L2_WRITEBACKS:
369 query->end_result = sctx->num_L2_writebacks;
370 break;
371 case SI_QUERY_NUM_RESIDENT_HANDLES:
372 query->end_result = sctx->num_resident_handles;
373 break;
374 case SI_QUERY_TC_OFFLOADED_SLOTS:
375 query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
376 break;
377 case SI_QUERY_TC_DIRECT_SLOTS:
378 query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
379 break;
380 case SI_QUERY_TC_NUM_SYNCS:
381 query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
382 break;
383 case SI_QUERY_REQUESTED_VRAM:
384 case SI_QUERY_REQUESTED_GTT:
385 case SI_QUERY_MAPPED_VRAM:
386 case SI_QUERY_MAPPED_GTT:
387 case SI_QUERY_VRAM_USAGE:
388 case SI_QUERY_VRAM_VIS_USAGE:
389 case SI_QUERY_GTT_USAGE:
390 case SI_QUERY_GPU_TEMPERATURE:
391 case SI_QUERY_CURRENT_GPU_SCLK:
392 case SI_QUERY_CURRENT_GPU_MCLK:
393 case SI_QUERY_BUFFER_WAIT_TIME:
394 case SI_QUERY_GFX_IB_SIZE:
395 case SI_QUERY_NUM_MAPPED_BUFFERS:
396 case SI_QUERY_NUM_GFX_IBS:
397 case SI_QUERY_NUM_SDMA_IBS:
398 case SI_QUERY_NUM_BYTES_MOVED:
399 case SI_QUERY_NUM_EVICTIONS:
400 case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
401 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
402 query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
403 break;
404 }
405 case SI_QUERY_GFX_BO_LIST_SIZE:
406 ws_id = winsys_id_from_type(query->b.type);
407 query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
408 query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
409 break;
410 case SI_QUERY_CS_THREAD_BUSY:
411 ws_id = winsys_id_from_type(query->b.type);
412 query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
413 query->end_time = os_time_get_nano();
414 break;
415 case SI_QUERY_GALLIUM_THREAD_BUSY:
416 query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
417 query->end_time = os_time_get_nano();
418 break;
419 case SI_QUERY_GPU_LOAD:
420 case SI_QUERY_GPU_SHADERS_BUSY:
421 case SI_QUERY_GPU_TA_BUSY:
422 case SI_QUERY_GPU_GDS_BUSY:
423 case SI_QUERY_GPU_VGT_BUSY:
424 case SI_QUERY_GPU_IA_BUSY:
425 case SI_QUERY_GPU_SX_BUSY:
426 case SI_QUERY_GPU_WD_BUSY:
427 case SI_QUERY_GPU_BCI_BUSY:
428 case SI_QUERY_GPU_SC_BUSY:
429 case SI_QUERY_GPU_PA_BUSY:
430 case SI_QUERY_GPU_DB_BUSY:
431 case SI_QUERY_GPU_CP_BUSY:
432 case SI_QUERY_GPU_CB_BUSY:
433 case SI_QUERY_GPU_SDMA_BUSY:
434 case SI_QUERY_GPU_PFP_BUSY:
435 case SI_QUERY_GPU_MEQ_BUSY:
436 case SI_QUERY_GPU_ME_BUSY:
437 case SI_QUERY_GPU_SURF_SYNC_BUSY:
438 case SI_QUERY_GPU_CP_DMA_BUSY:
439 case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
440 query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
441 query->begin_result = 0;
442 break;
443 case SI_QUERY_NUM_COMPILATIONS:
444 query->end_result = p_atomic_read(&sctx->screen->num_compilations);
445 break;
446 case SI_QUERY_NUM_SHADERS_CREATED:
447 query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
448 break;
449 case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
450 query->end_result = sctx->last_tex_ps_draw_ratio;
451 break;
452 case SI_QUERY_LIVE_SHADER_CACHE_HITS:
453 query->end_result = sctx->screen->live_shader_cache.hits;
454 break;
455 case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
456 query->end_result = sctx->screen->live_shader_cache.misses;
457 break;
458 case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
459 query->end_result = sctx->screen->num_memory_shader_cache_hits;
460 break;
461 case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
462 query->end_result = sctx->screen->num_memory_shader_cache_misses;
463 break;
464 case SI_QUERY_DISK_SHADER_CACHE_HITS:
465 query->end_result = sctx->screen->num_disk_shader_cache_hits;
466 break;
467 case SI_QUERY_DISK_SHADER_CACHE_MISSES:
468 query->end_result = sctx->screen->num_disk_shader_cache_misses;
469 break;
470 case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
471 query->end_result = sctx->compute_num_verts_accepted;
472 break;
473 case SI_QUERY_PD_NUM_PRIMS_REJECTED:
474 query->end_result = sctx->compute_num_verts_rejected;
475 break;
476 case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
477 query->end_result = sctx->compute_num_verts_ineligible;
478 break;
479 case SI_QUERY_GPIN_ASIC_ID:
480 case SI_QUERY_GPIN_NUM_SIMD:
481 case SI_QUERY_GPIN_NUM_RB:
482 case SI_QUERY_GPIN_NUM_SPI:
483 case SI_QUERY_GPIN_NUM_SE:
484 break;
485 default:
486 unreachable("si_query_sw_end: bad query type");
487 }
488
489 return true;
490 }
491
492 static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
493 union pipe_query_result *result)
494 {
495 struct si_query_sw *query = (struct si_query_sw *)squery;
496
497 switch (query->b.type) {
498 case PIPE_QUERY_TIMESTAMP_DISJOINT:
499 /* Convert from cycles per millisecond to cycles per second (Hz). */
500 result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
501 result->timestamp_disjoint.disjoint = false;
502 return true;
503 case PIPE_QUERY_GPU_FINISHED: {
504 struct pipe_screen *screen = sctx->b.screen;
505 struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
506
507 result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
508 return result->b;
509 }
510
511 case SI_QUERY_GFX_BO_LIST_SIZE:
512 result->u64 =
513 (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
514 return true;
515 case SI_QUERY_CS_THREAD_BUSY:
516 case SI_QUERY_GALLIUM_THREAD_BUSY:
517 result->u64 =
518 (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
519 return true;
520 case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
521 case SI_QUERY_PD_NUM_PRIMS_REJECTED:
522 case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
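      /* The primitive-discard compute path counts vertices; divide by 3 to
       * report (triangle) primitives. */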
523 result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3;
524 return true;
525 case SI_QUERY_GPIN_ASIC_ID:
526 result->u32 = 0;
527 return true;
528 case SI_QUERY_GPIN_NUM_SIMD:
529 result->u32 = sctx->screen->info.num_good_compute_units;
530 return true;
531 case SI_QUERY_GPIN_NUM_RB:
532 result->u32 = sctx->screen->info.num_render_backends;
533 return true;
534 case SI_QUERY_GPIN_NUM_SPI:
535 result->u32 = 1; /* all supported chips have one SPI per SE */
536 return true;
537 case SI_QUERY_GPIN_NUM_SE:
538 result->u32 = sctx->screen->info.max_se;
539 return true;
540 }
541
542 result->u64 = query->end_result - query->begin_result;
543
544 switch (query->b.type) {
545 case SI_QUERY_BUFFER_WAIT_TIME:
546 case SI_QUERY_GPU_TEMPERATURE:
547 result->u64 /= 1000;
548 break;
549 case SI_QUERY_CURRENT_GPU_SCLK:
550 case SI_QUERY_CURRENT_GPU_MCLK:
551 result->u64 *= 1000000;
552 break;
553 }
554
555 return true;
556 }
557
558 static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
559 .begin = si_query_sw_begin,
560 .end = si_query_sw_end,
561 .get_result = si_query_sw_get_result,
562 .get_result_resource = NULL};
563
564 static struct pipe_query *si_query_sw_create(unsigned query_type)
565 {
566 struct si_query_sw *query;
567
568 query = CALLOC_STRUCT(si_query_sw);
569 if (!query)
570 return NULL;
571
572 query->b.type = query_type;
573 query->b.ops = &sw_query_ops;
574
575 return (struct pipe_query *)query;
576 }
577
578 void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
579 {
580 struct si_query_buffer *prev = buffer->previous;
581
582 /* Release all query buffers. */
583 while (prev) {
584 struct si_query_buffer *qbuf = prev;
585 prev = prev->previous;
586 si_resource_reference(&qbuf->buf, NULL);
587 FREE(qbuf);
588 }
589
590 si_resource_reference(&buffer->buf, NULL);
591 }
592
593 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
594 {
595 /* Discard all query buffers except for the oldest. */
596 while (buffer->previous) {
597 struct si_query_buffer *qbuf = buffer->previous;
598 buffer->previous = qbuf->previous;
599
600 si_resource_reference(&buffer->buf, NULL);
601 buffer->buf = qbuf->buf; /* move ownership */
602 FREE(qbuf);
603 }
604 buffer->results_end = 0;
605
606 if (!buffer->buf)
607 return;
608
609 /* Discard even the oldest buffer if it can't be mapped without a stall. */
610 if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
611 !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
612 si_resource_reference(&buffer->buf, NULL);
613 } else {
614 buffer->unprepared = true;
615 }
616 }
617
618 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
619 bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
620 unsigned size)
621 {
622 bool unprepared = buffer->unprepared;
623 buffer->unprepared = false;
624
625 if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
626 if (buffer->buf) {
627 struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
628 memcpy(qbuf, buffer, sizeof(*qbuf));
629 buffer->previous = qbuf;
630 }
631 buffer->results_end = 0;
632
633 /* Queries are normally read by the CPU after
634 * being written by the GPU, hence staging is probably a good
635 * usage pattern.
636 */
637 struct si_screen *screen = sctx->screen;
638 unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
639 buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
640 if (unlikely(!buffer->buf))
641 return false;
642 unprepared = true;
643 }
644
645 if (unprepared && prepare_buffer) {
646 if (unlikely(!prepare_buffer(sctx, buffer))) {
647 si_resource_reference(&buffer->buf, NULL);
648 return false;
649 }
650 }
651
652 return true;
653 }
654
655 void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
656 {
657 struct si_query_hw *query = (struct si_query_hw *)squery;
658
659 si_query_buffer_destroy(sctx->screen, &query->buffer);
660 si_resource_reference(&query->workaround_buf, NULL);
661 FREE(squery);
662 }
663
664 static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
665 {
666 static const struct si_query_hw si_query_hw_s;
667 struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer);
668 struct si_screen *screen = sctx->screen;
669
670 /* The caller ensures that the buffer is currently unused by the GPU. */
671 uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL,
672 PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
673 if (!results)
674 return false;
675
676 memset(results, 0, qbuf->buf->b.b.width0);
677
678 if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
679 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
680 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
681 unsigned max_rbs = screen->info.num_render_backends;
682 unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
683 unsigned num_results;
684 unsigned i, j;
685
686 /* Set top bits for unused backends. */
687 num_results = qbuf->buf->b.b.width0 / query->result_size;
688 for (j = 0; j < num_results; j++) {
689 for (i = 0; i < max_rbs; i++) {
690 if (!(enabled_rb_mask & (1 << i))) {
691 results[(i * 4) + 1] = 0x80000000;
692 results[(i * 4) + 3] = 0x80000000;
693 }
694 }
695 results += 4 * max_rbs;
696 }
697 }
698
699 return true;
700 }
701
702 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
703 bool wait, enum pipe_query_value_type result_type,
704 int index, struct pipe_resource *resource,
705 unsigned offset);
706
707 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
708 struct si_resource *buffer, uint64_t va);
709 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
710 struct si_resource *buffer, uint64_t va);
711 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer,
712 union pipe_query_result *result);
713 static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *);
714
715 static struct si_query_hw_ops query_hw_default_hw_ops = {
716 .prepare_buffer = si_query_hw_prepare_buffer,
717 .emit_start = si_query_hw_do_emit_start,
718 .emit_stop = si_query_hw_do_emit_stop,
719 .clear_result = si_query_hw_clear_result,
720 .add_result = si_query_hw_add_result,
721 };
722
723 static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
724 unsigned index)
725 {
726 struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
727 if (!query)
728 return NULL;
729
730 query->b.type = query_type;
731 query->b.ops = &query_hw_ops;
732 query->ops = &query_hw_default_hw_ops;
733
734 switch (query_type) {
735 case PIPE_QUERY_OCCLUSION_COUNTER:
736 case PIPE_QUERY_OCCLUSION_PREDICATE:
737 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
738 query->result_size = 16 * sscreen->info.num_render_backends;
739 query->result_size += 16; /* for the fence + alignment */
740 query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
741 break;
742 case SI_QUERY_TIME_ELAPSED_SDMA:
743 /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
744 query->result_size = 64;
745 break;
746 case PIPE_QUERY_TIME_ELAPSED:
747 query->result_size = 24;
748 query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
749 break;
750 case PIPE_QUERY_TIMESTAMP:
751 query->result_size = 16;
752 query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
753 query->flags = SI_QUERY_HW_FLAG_NO_START;
754 break;
755 case PIPE_QUERY_PRIMITIVES_EMITTED:
756 case PIPE_QUERY_PRIMITIVES_GENERATED:
757 case PIPE_QUERY_SO_STATISTICS:
758 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
759 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
760 query->result_size = 32;
761 query->b.num_cs_dw_suspend = 6;
762 query->stream = index;
763 break;
764 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
765 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
766 query->result_size = 32 * SI_MAX_STREAMS;
767 query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
768 break;
769 case PIPE_QUERY_PIPELINE_STATISTICS:
770 /* 11 values on GCN. */
771 query->result_size = 11 * 16;
772 query->result_size += 8; /* for the fence + alignment */
773 query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
774 break;
775 default:
776 assert(0);
777 FREE(query);
778 return NULL;
779 }
780
781 return (struct pipe_query *)query;
782 }
783
784 static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
785 {
786 if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
787 type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
788 bool old_enable = sctx->num_occlusion_queries != 0;
789 bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
790 bool enable, perfect_enable;
791
792 sctx->num_occlusion_queries += diff;
793 assert(sctx->num_occlusion_queries >= 0);
794
795 if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
796 sctx->num_perfect_occlusion_queries += diff;
797 assert(sctx->num_perfect_occlusion_queries >= 0);
798 }
799
800 enable = sctx->num_occlusion_queries != 0;
801 perfect_enable = sctx->num_perfect_occlusion_queries != 0;
802
803 if (enable != old_enable || perfect_enable != old_perfect_enable) {
804 si_set_occlusion_query_state(sctx, old_perfect_enable);
805 }
806 }
807 }
808
809 static unsigned event_type_for_stream(unsigned stream)
810 {
811 switch (stream) {
812 default:
813 case 0:
814 return V_028A90_SAMPLE_STREAMOUTSTATS;
815 case 1:
816 return V_028A90_SAMPLE_STREAMOUTSTATS1;
817 case 2:
818 return V_028A90_SAMPLE_STREAMOUTSTATS2;
819 case 3:
820 return V_028A90_SAMPLE_STREAMOUTSTATS3;
821 }
822 }
823
824 static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
825 {
826 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
827 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
828 radeon_emit(cs, va);
829 radeon_emit(cs, va >> 32);
830 }
831
832 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
833 struct si_resource *buffer, uint64_t va)
834 {
835 struct radeon_cmdbuf *cs = sctx->gfx_cs;
836
837 switch (query->b.type) {
838 case SI_QUERY_TIME_ELAPSED_SDMA:
839 si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
840 return;
841 case PIPE_QUERY_OCCLUSION_COUNTER:
842 case PIPE_QUERY_OCCLUSION_PREDICATE:
843 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
844 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
845 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
846 radeon_emit(cs, va);
847 radeon_emit(cs, va >> 32);
848 break;
849 case PIPE_QUERY_PRIMITIVES_EMITTED:
850 case PIPE_QUERY_PRIMITIVES_GENERATED:
851 case PIPE_QUERY_SO_STATISTICS:
852 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
853 emit_sample_streamout(cs, va, query->stream);
854 break;
855 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
856 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
857 emit_sample_streamout(cs, va + 32 * stream, stream);
858 break;
859 case PIPE_QUERY_TIME_ELAPSED:
860 si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
861 EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
862 break;
863 case PIPE_QUERY_PIPELINE_STATISTICS:
864 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
865 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
866 radeon_emit(cs, va);
867 radeon_emit(cs, va >> 32);
868 break;
869 default:
870 assert(0);
871 }
872 radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
873 RADEON_PRIO_QUERY);
874 }
875
876 static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
877 {
878 uint64_t va;
879
880 if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
881 return;
882
883 si_update_occlusion_query_state(sctx, query->b.type, 1);
884 si_update_prims_generated_query_state(sctx, query->b.type, 1);
885
886 if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
887 sctx->num_pipeline_stat_queries++;
888
889 if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
890 si_need_gfx_cs_space(sctx, 0);
891
892 va = query->buffer.buf->gpu_address + query->buffer.results_end;
893 query->ops->emit_start(sctx, query, query->buffer.buf, va);
894 }
895
896 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
897 struct si_resource *buffer, uint64_t va)
898 {
899 struct radeon_cmdbuf *cs = sctx->gfx_cs;
900 uint64_t fence_va = 0;
901
902 switch (query->b.type) {
903 case SI_QUERY_TIME_ELAPSED_SDMA:
904 si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
905 return;
906 case PIPE_QUERY_OCCLUSION_COUNTER:
907 case PIPE_QUERY_OCCLUSION_PREDICATE:
908 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
909 va += 8;
910 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
911 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
912 radeon_emit(cs, va);
913 radeon_emit(cs, va >> 32);
914
915 fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
916 break;
917 case PIPE_QUERY_PRIMITIVES_EMITTED:
918 case PIPE_QUERY_PRIMITIVES_GENERATED:
919 case PIPE_QUERY_SO_STATISTICS:
920 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
921 va += 16;
922 emit_sample_streamout(cs, va, query->stream);
923 break;
924 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
925 va += 16;
926 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
927 emit_sample_streamout(cs, va + 32 * stream, stream);
928 break;
929 case PIPE_QUERY_TIME_ELAPSED:
930 va += 8;
931 /* fall through */
932 case PIPE_QUERY_TIMESTAMP:
933 si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
934 EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
935 fence_va = va + 8;
936 break;
937 case PIPE_QUERY_PIPELINE_STATISTICS: {
938 unsigned sample_size = (query->result_size - 8) / 2;
939
940 va += sample_size;
941 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
942 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
943 radeon_emit(cs, va);
944 radeon_emit(cs, va >> 32);
945
946 fence_va = va + sample_size;
947 break;
948 }
949 default:
950 assert(0);
951 }
952 radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
953 RADEON_PRIO_QUERY);
954
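   /* Mark the sample as complete: once the bottom-of-pipe event retires,
    * RELEASE_MEM writes 0x80000000 into the fence dword that the result
    * paths poll or wait on. */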
955 if (fence_va) {
956 si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
957 EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
958 query->b.type);
959 }
960 }
961
962 static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
963 {
964 uint64_t va;
965
966 /* Queries that have a begin already allocated their buffer in begin_query;
967 * only NO_START queries allocate it here. */
967 if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
968 si_need_gfx_cs_space(sctx, 0);
969 if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
970 query->result_size))
971 return;
972 }
973
974 if (!query->buffer.buf)
975 return; // previous buffer allocation failure
976
977 /* emit end query */
978 va = query->buffer.buf->gpu_address + query->buffer.results_end;
979
980 query->ops->emit_stop(sctx, query, query->buffer.buf, va);
981
982 query->buffer.results_end += query->result_size;
983
984 si_update_occlusion_query_state(sctx, query->b.type, -1);
985 si_update_prims_generated_query_state(sctx, query->b.type, -1);
986
987 if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
988 sctx->num_pipeline_stat_queries--;
989 }
990
991 static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
992 uint32_t op)
993 {
994 struct radeon_cmdbuf *cs = ctx->gfx_cs;
995
996 if (ctx->chip_class >= GFX9) {
997 radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
998 radeon_emit(cs, op);
999 radeon_emit(cs, va);
1000 radeon_emit(cs, va >> 32);
1001 } else {
1002 radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
1003 radeon_emit(cs, va);
1004 radeon_emit(cs, op | ((va >> 32) & 0xFF));
1005 }
1006 radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY);
1007 }
1008
1009 static void si_emit_query_predication(struct si_context *ctx)
1010 {
1011 struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
1012 struct si_query_buffer *qbuf;
1013 uint32_t op;
1014 bool flag_wait, invert;
1015
1016 if (!query)
1017 return;
1018
1019 if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1020 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
1021 assert(!"not implemented");
1022 }
1023
1024 invert = ctx->render_cond_invert;
1025 flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
1026 ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
1027
1028 if (query->workaround_buf) {
1029 op = PRED_OP(PREDICATION_OP_BOOL64);
1030 } else {
1031 switch (query->b.type) {
1032 case PIPE_QUERY_OCCLUSION_COUNTER:
1033 case PIPE_QUERY_OCCLUSION_PREDICATE:
1034 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1035 op = PRED_OP(PREDICATION_OP_ZPASS);
1036 break;
1037 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1038 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1039 op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1040 invert = !invert;
1041 break;
1042 default:
1043 assert(0);
1044 return;
1045 }
1046 }
1047
1048 /* if true then invert, see GL_ARB_conditional_render_inverted */
1049 if (invert)
1050 op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1051 else
1052 op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1053
1054 /* Use the value written by the compute shader as a workaround. Note that
1055 * the wait flag does not apply in this predication mode.
1056 *
1057 * The shader outputs the result value to L2. Workarounds only affect GFX8
1058 * and later, where the CP reads data from L2, so we don't need an
1059 * additional flush.
1060 */
1061 if (query->workaround_buf) {
1062 uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
1063 emit_set_predicate(ctx, query->workaround_buf, va, op);
1064 return;
1065 }
1066
1067 op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1068
1069 /* emit predicate packets for all data blocks */
1070 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1071 unsigned results_base = 0;
1072 uint64_t va_base = qbuf->buf->gpu_address;
1073
1074 while (results_base < qbuf->results_end) {
1075 uint64_t va = va_base + results_base;
1076
1077 if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1078 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1079 emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1080
1081 /* set CONTINUE bit for all packets except the first */
1082 op |= PREDICATION_CONTINUE;
1083 }
1084 } else {
1085 emit_set_predicate(ctx, qbuf->buf, va, op);
1086 op |= PREDICATION_CONTINUE;
1087 }
1088
1089 results_base += query->result_size;
1090 }
1091 }
1092 }
1093
1094 static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
1095 unsigned index)
1096 {
1097 struct si_screen *sscreen = (struct si_screen *)ctx->screen;
1098
1099 if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
1100 (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && query_type != SI_QUERY_TIME_ELAPSED_SDMA))
1101 return si_query_sw_create(query_type);
1102
1103 if (sscreen->use_ngg_streamout &&
1104 (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
1105 query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
1106 query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1107 query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
1108 return gfx10_sh_query_create(sscreen, query_type, index);
1109
1110 return si_query_hw_create(sscreen, query_type, index);
1111 }
1112
1113 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1114 {
1115 struct si_context *sctx = (struct si_context *)ctx;
1116 struct si_query *squery = (struct si_query *)query;
1117
1118 squery->ops->destroy(sctx, squery);
1119 }
1120
1121 static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
1122 {
1123 struct si_context *sctx = (struct si_context *)ctx;
1124 struct si_query *squery = (struct si_query *)query;
1125
1126 return squery->ops->begin(sctx, squery);
1127 }
1128
1129 bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
1130 {
1131 struct si_query_hw *query = (struct si_query_hw *)squery;
1132
1133 if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1134 assert(0);
1135 return false;
1136 }
1137
1138 if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
1139 si_query_buffer_reset(sctx, &query->buffer);
1140
1141 si_resource_reference(&query->workaround_buf, NULL);
1142
1143 si_query_hw_emit_start(sctx, query);
1144 if (!query->buffer.buf)
1145 return false;
1146
1147 list_addtail(&query->b.active_list, &sctx->active_queries);
1148 sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1149 return true;
1150 }
1151
1152 static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
1153 {
1154 struct si_context *sctx = (struct si_context *)ctx;
1155 struct si_query *squery = (struct si_query *)query;
1156
1157 return squery->ops->end(sctx, squery);
1158 }
1159
1160 bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
1161 {
1162 struct si_query_hw *query = (struct si_query_hw *)squery;
1163
1164 if (query->flags & SI_QUERY_HW_FLAG_NO_START)
1165 si_query_buffer_reset(sctx, &query->buffer);
1166
1167 si_query_hw_emit_stop(sctx, query);
1168
1169 if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
1170 list_delinit(&query->b.active_list);
1171 sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
1172 }
1173
1174 if (!query->buffer.buf)
1175 return false;
1176
1177 return true;
1178 }
1179
1180 static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
1181 struct si_hw_query_params *params)
1182 {
1183 unsigned max_rbs = sctx->screen->info.num_render_backends;
1184
1185 params->pair_stride = 0;
1186 params->pair_count = 1;
1187
1188 switch (squery->b.type) {
1189 case PIPE_QUERY_OCCLUSION_COUNTER:
1190 case PIPE_QUERY_OCCLUSION_PREDICATE:
1191 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1192 params->start_offset = 0;
1193 params->end_offset = 8;
1194 params->fence_offset = max_rbs * 16;
1195 params->pair_stride = 16;
1196 params->pair_count = max_rbs;
1197 break;
1198 case PIPE_QUERY_TIME_ELAPSED:
1199 params->start_offset = 0;
1200 params->end_offset = 8;
1201 params->fence_offset = 16;
1202 break;
1203 case PIPE_QUERY_TIMESTAMP:
1204 params->start_offset = 0;
1205 params->end_offset = 0;
1206 params->fence_offset = 8;
1207 break;
1208 case PIPE_QUERY_PRIMITIVES_EMITTED:
1209 params->start_offset = 8;
1210 params->end_offset = 24;
1211 params->fence_offset = params->end_offset + 4;
1212 break;
1213 case PIPE_QUERY_PRIMITIVES_GENERATED:
1214 params->start_offset = 0;
1215 params->end_offset = 16;
1216 params->fence_offset = params->end_offset + 4;
1217 break;
1218 case PIPE_QUERY_SO_STATISTICS:
1219 params->start_offset = 8 - index * 8;
1220 params->end_offset = 24 - index * 8;
1221 params->fence_offset = params->end_offset + 4;
1222 break;
1223 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1224 params->pair_count = SI_MAX_STREAMS;
1225 params->pair_stride = 32;
1226 /* fallthrough */
1227 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1228 params->start_offset = 0;
1229 params->end_offset = 16;
1230
1231 /* We can re-use the high dword of the last 64-bit value as a
1232 * fence: it is initialized as 0, and the high bit is set by
1233 * the write of the streamout stats event.
1234 */
1235 params->fence_offset = squery->result_size - 4;
1236 break;
1237 case PIPE_QUERY_PIPELINE_STATISTICS: {
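      /* One SAMPLE_PIPELINESTAT sample is 11 64-bit counters (88 bytes);
       * offsets[] maps the query result index to the byte offset of the
       * matching counter within a sample. */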
1238 static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
1239 params->start_offset = offsets[index];
1240 params->end_offset = 88 + offsets[index];
1241 params->fence_offset = 2 * 88;
1242 break;
1243 }
1244 default:
1245 unreachable("si_get_hw_query_params unsupported");
1246 }
1247 }
1248
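/* Read a begin/end pair of 64-bit counters from a mapped query buffer and
 * return end - start. When test_status_bit is set, the top bit of each value
 * is the hardware's "result written" flag, and 0 is returned until both
 * halves are valid. */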
1249 static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
1250 bool test_status_bit)
1251 {
1252 uint32_t *current_result = (uint32_t *)map;
1253 uint64_t start, end;
1254
1255 start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
1256 end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
1257
1258 if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1259 return end - start;
1260 }
1261 return 0;
1262 }
1263
1264 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1265 void *buffer, union pipe_query_result *result)
1266 {
1267 unsigned max_rbs = sscreen->info.num_render_backends;
1268
1269 switch (query->b.type) {
1270 case PIPE_QUERY_OCCLUSION_COUNTER: {
1271 for (unsigned i = 0; i < max_rbs; ++i) {
1272 unsigned results_base = i * 16;
1273 result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1274 }
1275 break;
1276 }
1277 case PIPE_QUERY_OCCLUSION_PREDICATE:
1278 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1279 for (unsigned i = 0; i < max_rbs; ++i) {
1280 unsigned results_base = i * 16;
1281 result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1282 }
1283 break;
1284 }
1285 case PIPE_QUERY_TIME_ELAPSED:
1286 result->u64 += si_query_read_result(buffer, 0, 2, false);
1287 break;
1288 case SI_QUERY_TIME_ELAPSED_SDMA:
1289 result->u64 += si_query_read_result(buffer, 0, 32 / 4, false);
1290 break;
1291 case PIPE_QUERY_TIMESTAMP:
1292 result->u64 = *(uint64_t *)buffer;
1293 break;
1294 case PIPE_QUERY_PRIMITIVES_EMITTED:
1295 /* SAMPLE_STREAMOUTSTATS stores this structure:
1296 * {
1297 * u64 NumPrimitivesWritten;
1298 * u64 PrimitiveStorageNeeded;
1299 * }
1300 * We only need NumPrimitivesWritten here. */
1301 result->u64 += si_query_read_result(buffer, 2, 6, true);
1302 break;
1303 case PIPE_QUERY_PRIMITIVES_GENERATED:
1304 /* Here we read PrimitiveStorageNeeded. */
1305 result->u64 += si_query_read_result(buffer, 0, 4, true);
1306 break;
1307 case PIPE_QUERY_SO_STATISTICS:
1308 result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1309 result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1310 break;
1311 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1312 result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1313 si_query_read_result(buffer, 0, 4, true);
1314 break;
1315 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1316 for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1317 result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1318 si_query_read_result(buffer, 0, 4, true);
1319 buffer = (char *)buffer + 32;
1320 }
1321 break;
1322 case PIPE_QUERY_PIPELINE_STATISTICS:
1323 result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false);
1324 result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false);
1325 result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false);
1326 result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false);
1327 result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false);
1328 result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false);
1329 result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false);
1330 result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false);
1331 result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false);
1332 result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false);
1333 result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false);
1334 #if 0 /* for testing */
1335 printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1336 "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1337 "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1338 result->pipeline_statistics.ia_vertices,
1339 result->pipeline_statistics.ia_primitives,
1340 result->pipeline_statistics.vs_invocations,
1341 result->pipeline_statistics.hs_invocations,
1342 result->pipeline_statistics.ds_invocations,
1343 result->pipeline_statistics.gs_invocations,
1344 result->pipeline_statistics.gs_primitives,
1345 result->pipeline_statistics.c_invocations,
1346 result->pipeline_statistics.c_primitives,
1347 result->pipeline_statistics.ps_invocations,
1348 result->pipeline_statistics.cs_invocations);
1349 #endif
1350 break;
1351 default:
1352 assert(0);
1353 }
1354 }
1355
1356 void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
1357 {
1358 si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
1359 }
1360
1361 void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
1362 {
1363 si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
1364 }
1365
1366 static const struct si_query_ops query_hw_ops = {
1367 .destroy = si_query_hw_destroy,
1368 .begin = si_query_hw_begin,
1369 .end = si_query_hw_end,
1370 .get_result = si_query_hw_get_result,
1371 .get_result_resource = si_query_hw_get_result_resource,
1372
1373 .suspend = si_query_hw_suspend,
1374 .resume = si_query_hw_resume,
1375 };
1376
1377 static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1378 union pipe_query_result *result)
1379 {
1380 struct si_context *sctx = (struct si_context *)ctx;
1381 struct si_query *squery = (struct si_query *)query;
1382
1383 return squery->ops->get_result(sctx, squery, wait, result);
1384 }
1385
1386 static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1387 bool wait, enum pipe_query_value_type result_type,
1388 int index, struct pipe_resource *resource, unsigned offset)
1389 {
1390 struct si_context *sctx = (struct si_context *)ctx;
1391 struct si_query *squery = (struct si_query *)query;
1392
1393 squery->ops->get_result_resource(sctx, squery, wait, result_type, index, resource, offset);
1394 }
1395
1396 static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1397 {
1398 util_query_clear_result(result, query->b.type);
1399 }
1400
1401 bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1402 union pipe_query_result *result)
1403 {
1404 struct si_screen *sscreen = sctx->screen;
1405 struct si_query_hw *query = (struct si_query_hw *)squery;
1406 struct si_query_buffer *qbuf;
1407
1408 query->ops->clear_result(query, result);
1409
1410 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1411 unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1412 unsigned results_base = 0;
1413 void *map;
1414
1415 if (squery->b.flushed)
1416 map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
1417 else
1418 map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
1419
1420 if (!map)
1421 return false;
1422
1423 while (results_base != qbuf->results_end) {
1424 query->ops->add_result(sscreen, query, map + results_base, result);
1425 results_base += query->result_size;
1426 }
1427 }
1428
1429 /* Convert the time to expected units. */
1430 if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
1431 squery->type == PIPE_QUERY_TIMESTAMP) {
1432 result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1433 }
1434 return true;
1435 }
1436
1437 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1438 bool wait, enum pipe_query_value_type result_type,
1439 int index, struct pipe_resource *resource,
1440 unsigned offset)
1441 {
1442 struct si_query_hw *query = (struct si_query_hw *)squery;
1443 struct si_query_buffer *qbuf;
1444 struct si_query_buffer *qbuf_prev;
1445 struct pipe_resource *tmp_buffer = NULL;
1446 unsigned tmp_buffer_offset = 0;
1447 struct si_qbo_state saved_state = {};
1448 struct pipe_grid_info grid = {};
1449 struct pipe_constant_buffer constant_buffer = {};
1450 struct pipe_shader_buffer ssbo[3];
1451 struct si_hw_query_params params;
1452 struct {
1453 uint32_t end_offset;
1454 uint32_t result_stride;
1455 uint32_t result_count;
1456 uint32_t config;
1457 uint32_t fence_offset;
1458 uint32_t pair_stride;
1459 uint32_t pair_count;
1460 } consts;
1461
1462 if (!sctx->query_result_shader) {
1463 sctx->query_result_shader = si_create_query_result_cs(sctx);
1464 if (!sctx->query_result_shader)
1465 return;
1466 }
1467
1468 if (query->buffer.previous) {
1469 u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1470 if (!tmp_buffer)
1471 return;
1472 }
1473
1474 si_save_qbo_state(sctx, &saved_state);
1475
1476 si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, ¶ms);
1477 consts.end_offset = params.end_offset - params.start_offset;
1478 consts.fence_offset = params.fence_offset - params.start_offset;
1479 consts.result_stride = query->result_size;
1480 consts.pair_stride = params.pair_stride;
1481 consts.pair_count = params.pair_count;
1482
1483 constant_buffer.buffer_size = sizeof(consts);
1484 constant_buffer.user_buffer = &consts;
1485
1486 ssbo[1].buffer = tmp_buffer;
1487 ssbo[1].buffer_offset = tmp_buffer_offset;
1488 ssbo[1].buffer_size = 16;
1489
1490 ssbo[2] = ssbo[1];
1491
1492 sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);
1493
1494 grid.block[0] = 1;
1495 grid.block[1] = 1;
1496 grid.block[2] = 1;
1497 grid.grid[0] = 1;
1498 grid.grid[1] = 1;
1499 grid.grid[2] = 1;
1500
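   /* consts.config is a bit field consumed by the result-conversion compute
    * shader from si_create_query_result_cs(); the bits below select the query
    * layout, buffer chaining and the requested result type (the exact bit
    * meanings are defined by that shader). */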
1501 consts.config = 0;
1502 if (index < 0)
1503 consts.config |= 4;
1504 if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1505 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1506 consts.config |= 8;
1507 else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1508 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1509 consts.config |= 8 | 256;
1510 else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1511 consts.config |= 32;
1512
1513 switch (result_type) {
1514 case PIPE_QUERY_TYPE_U64:
1515 case PIPE_QUERY_TYPE_I64:
1516 consts.config |= 64;
1517 break;
1518 case PIPE_QUERY_TYPE_I32:
1519 consts.config |= 128;
1520 break;
1521 case PIPE_QUERY_TYPE_U32:
1522 break;
1523 }
1524
1525 sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
1526
   for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
      if (query->b.type != PIPE_QUERY_TIMESTAMP) {
         qbuf_prev = qbuf->previous;
         consts.result_count = qbuf->results_end / query->result_size;
         consts.config &= ~3;
         if (qbuf != &query->buffer)
            consts.config |= 1;
         if (qbuf->previous)
            consts.config |= 2;
      } else {
         /* Only read the last timestamp. */
         qbuf_prev = NULL;
         consts.result_count = 0;
         consts.config |= 16;
         params.start_offset += qbuf->results_end - query->result_size;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = params.start_offset;
      ssbo[0].buffer_size = qbuf->results_end - params.start_offset;

      if (!qbuf->previous) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;

         si_resource(resource)->TC_L2_dirty = true;
      }

      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 1 << 2);

      if (wait && qbuf == &query->buffer) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
         va += params.fence_offset;

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
                                enum pipe_render_cond_flag mode)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query_hw *squery = (struct si_query_hw *)query;
   struct si_atom *atom = &sctx->atoms.s.render_cond;

   if (query) {
      bool needs_workaround = false;

      /* There was a firmware regression in GFX8 which causes successive
       * SET_PREDICATION packets to give the wrong answer for
       * non-inverted stream overflow predication.
       */
      if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
           (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
          !condition &&
          (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
            (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
         needs_workaround = true;
      }
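
      /* Work around it by resolving the predicate with the query-result
       * compute shader into a small scratch buffer first; the predication
       * then reads that single 64-bit value instead of walking the raw
       * stream-overflow results.
       */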
      if (needs_workaround && !squery->workaround_buf) {
         bool old_force_off = sctx->render_cond_force_off;
         sctx->render_cond_force_off = true;

         u_suballocator_alloc(sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
                              (struct pipe_resource **)&squery->workaround_buf);

         /* Reset to NULL to avoid a redundant SET_PREDICATION
          * from launching the compute grid.
          */
         sctx->render_cond = NULL;

         ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
                                        &squery->workaround_buf->b.b, squery->workaround_offset);

         /* Setting this in the render cond atom is too late,
          * so set it here. */
         sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;

         sctx->render_cond_force_off = old_force_off;
      }
   }

   sctx->render_cond = query;
   sctx->render_cond_invert = condition;
   sctx->render_cond_mode = mode;

   si_set_atom_dirty(sctx, atom, query != NULL);
}

void si_suspend_queries(struct si_context *sctx)
{
   struct si_query *query;

   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
      query->ops->suspend(sctx, query);
}

void si_resume_queries(struct si_context *sctx)
{
   struct si_query *query;

   /* Check CS space here. Resuming must not be interrupted by flushes. */
   si_need_gfx_cs_space(sctx, 0);

   LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
      query->ops->resume(sctx, query);
}

#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
   { \
      .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
      .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_ \
   }

#define X(name_, query_type_, type_, result_type_) \
   XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)

#define XG(group_, name_, query_type_, type_, result_type_) \
   XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)

static struct pipe_driver_query_info si_driver_query_list[] = {
   X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
   X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
   X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
   X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
   X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),
   X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
   X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
   X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
   X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
   X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
   X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
   X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
   X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
   X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
   X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
   X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
   X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
   X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
   X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
   X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
   X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
   X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
   X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
   X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
   X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
   X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
   X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
   X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
   X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
   X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
   X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
   X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
   X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
   X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
   X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
   X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
   X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
   X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
   X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
   X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
   X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
   X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
   X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
   X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
   X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
   X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
   X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),

   /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
    * which use them as a fallback path to detect the GPU type.
    *
    * Note: The names of these queries are significant for GPUPerfStudio
    * (and possibly their order as well). */
   XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
   XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
   XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
   XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
   XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),

   X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
   X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
   X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),

   /* The following queries must be at the end of the list because their
    * availability is adjusted dynamically based on the DRM version. */
   X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
   X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
   X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
   X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
   X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
   X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
   X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
   X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
   X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
   X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
   X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
   X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
   X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
   X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),

   /* SRBM_STATUS2 */
   X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),

   /* CP_STAT */
   X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
   X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
   X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
   X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
   X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
   X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),

   X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE),
   X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE),
   X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE),
};

#undef X
#undef XG
#undef XFULL

static unsigned si_get_num_queries(struct si_screen *sscreen)
{
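   /* The subtractions below trim trailing entries of si_driver_query_list
    * that the kernel/winsys cannot report on this configuration; see the
    * comment above the "GPU-load" entry in the list.
    */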
   /* amdgpu */
   if (sscreen->info.is_amdgpu) {
      if (sscreen->info.chip_class >= GFX8)
         return ARRAY_SIZE(si_driver_query_list);
      else
         return ARRAY_SIZE(si_driver_query_list) - 7;
   }

   /* radeon */
   if (sscreen->info.has_read_registers_query) {
      if (sscreen->info.chip_class == GFX7)
         return ARRAY_SIZE(si_driver_query_list) - 6;
      else
         return ARRAY_SIZE(si_driver_query_list) - 7;
   }

   return ARRAY_SIZE(si_driver_query_list) - 21;
}

static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
                                    struct pipe_driver_query_info *info)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   unsigned num_queries = si_get_num_queries(sscreen);

   if (!info) {
      unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);

      return num_queries + num_perfcounters;
   }

   if (index >= num_queries)
      return si_get_perfcounter_info(sscreen, index - num_queries, info);

   *info = si_driver_query_list[index];
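
   /* Report an upper bound for queries that have a well-defined maximum
    * (memory sizes, GPU temperature), so that consumers such as the
    * Gallium HUD can scale their graphs.
    */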
   switch (info->query_type) {
   case SI_QUERY_REQUESTED_VRAM:
   case SI_QUERY_VRAM_USAGE:
   case SI_QUERY_MAPPED_VRAM:
      info->max_value.u64 = sscreen->info.vram_size;
      break;
   case SI_QUERY_REQUESTED_GTT:
   case SI_QUERY_GTT_USAGE:
   case SI_QUERY_MAPPED_GTT:
      info->max_value.u64 = sscreen->info.gart_size;
      break;
   case SI_QUERY_GPU_TEMPERATURE:
      info->max_value.u64 = 125;
      break;
   case SI_QUERY_VRAM_VIS_USAGE:
      info->max_value.u64 = sscreen->info.vram_vis_size;
      break;
   }

   if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
      info->group_id += sscreen->perfcounters->num_groups;

   return 1;
}

/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
 * performance counter groups, so be careful when changing this and related
 * functions.
 */
static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
                                          struct pipe_driver_query_group_info *info)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   unsigned num_pc_groups = 0;

   if (sscreen->perfcounters)
      num_pc_groups = sscreen->perfcounters->num_groups;

   if (!info)
      return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;

   if (index < num_pc_groups)
      return si_get_perfcounter_group_info(sscreen, index, info);

   index -= num_pc_groups;
   if (index >= SI_NUM_SW_QUERY_GROUPS)
      return 0;

   info->name = "GPIN";
   info->max_active_queries = 5;
   info->num_queries = 5;
   return 1;
}

void si_init_query_functions(struct si_context *sctx)
{
   sctx->b.create_query = si_create_query;
   sctx->b.create_batch_query = si_create_batch_query;
   sctx->b.destroy_query = si_destroy_query;
   sctx->b.begin_query = si_begin_query;
   sctx->b.end_query = si_end_query;
   sctx->b.get_query_result = si_get_query_result;
   sctx->b.get_query_result_resource = si_get_query_result_resource;

   if (sctx->has_graphics) {
      sctx->atoms.s.render_cond.emit = si_emit_query_predication;
      sctx->b.render_condition = si_render_condition;
   }

   list_inithead(&sctx->active_queries);
}

void si_init_screen_query_functions(struct si_screen *sscreen)
{
   sscreen->b.get_driver_query_info = si_get_driver_query_info;
   sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
}