1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4  * Copyright 2018 Advanced Micro Devices, Inc.
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "si_query.h"
10 #include "si_build_pm4.h"
11 
12 #include "amd/common/sid.h"
13 #include "si_pipe.h"
14 #include "util/os_time.h"
15 #include "util/u_memory.h"
16 #include "util/u_suballoc.h"
17 #include "util/u_upload_mgr.h"
18 
19 static const struct si_query_ops query_hw_ops;
20 static const struct si_query_hw_ops query_hw_default_hw_ops;
21 static const struct si_query_ops sw_query_ops;
22 
23 struct si_hw_query_params {
24    unsigned start_offset;
25    unsigned end_offset;
26    unsigned fence_offset;
27    unsigned pair_stride;
28    unsigned pair_count;
29 };
30 
31 /* Queries without buffer handling or suspend/resume. */
32 struct si_query_sw {
33    struct si_query b;
34 
35    uint64_t begin_result;
36    uint64_t end_result;
37 
38    uint64_t begin_time;
39    uint64_t end_time;
40 
41    /* Fence for GPU_FINISHED. */
42    struct pipe_fence_handle *fence;
43 };
44 
45 static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
46 {
47    struct si_query_sw *query = (struct si_query_sw *)squery;
48 
49    sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
50    FREE(query);
51 }
52 
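/* Map a driver-specific SI_QUERY_* counter to the winsys value that backs it. */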
53 static enum radeon_value_id winsys_id_from_type(unsigned type)
54 {
55    switch (type) {
56    case SI_QUERY_REQUESTED_VRAM:
57       return RADEON_REQUESTED_VRAM_MEMORY;
58    case SI_QUERY_REQUESTED_GTT:
59       return RADEON_REQUESTED_GTT_MEMORY;
60    case SI_QUERY_MAPPED_VRAM:
61       return RADEON_MAPPED_VRAM;
62    case SI_QUERY_MAPPED_GTT:
63       return RADEON_MAPPED_GTT;
64    case SI_QUERY_SLAB_WASTED_VRAM:
65       return RADEON_SLAB_WASTED_VRAM;
66    case SI_QUERY_SLAB_WASTED_GTT:
67       return RADEON_SLAB_WASTED_GTT;
68    case SI_QUERY_BUFFER_WAIT_TIME:
69       return RADEON_BUFFER_WAIT_TIME_NS;
70    case SI_QUERY_NUM_MAPPED_BUFFERS:
71       return RADEON_NUM_MAPPED_BUFFERS;
72    case SI_QUERY_NUM_GFX_IBS:
73       return RADEON_NUM_GFX_IBS;
74    case SI_QUERY_GFX_BO_LIST_SIZE:
75       return RADEON_GFX_BO_LIST_COUNTER;
76    case SI_QUERY_GFX_IB_SIZE:
77       return RADEON_GFX_IB_SIZE_COUNTER;
78    case SI_QUERY_NUM_BYTES_MOVED:
79       return RADEON_NUM_BYTES_MOVED;
80    case SI_QUERY_NUM_EVICTIONS:
81       return RADEON_NUM_EVICTIONS;
82    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
83       return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
84    case SI_QUERY_VRAM_USAGE:
85       return RADEON_VRAM_USAGE;
86    case SI_QUERY_VRAM_VIS_USAGE:
87       return RADEON_VRAM_VIS_USAGE;
88    case SI_QUERY_GTT_USAGE:
89       return RADEON_GTT_USAGE;
90    case SI_QUERY_GPU_TEMPERATURE:
91       return RADEON_GPU_TEMPERATURE;
92    case SI_QUERY_CURRENT_GPU_SCLK:
93       return RADEON_CURRENT_SCLK;
94    case SI_QUERY_CURRENT_GPU_MCLK:
95       return RADEON_CURRENT_MCLK;
96    case SI_QUERY_CS_THREAD_BUSY:
97       return RADEON_CS_THREAD_TIME;
98    default:
99       unreachable("query type does not correspond to winsys id");
100    }
101 }
102 
103 static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
104 {
105    struct si_query_sw *query = (struct si_query_sw *)squery;
106    enum radeon_value_id ws_id;
107 
108    switch (query->b.type) {
109    case PIPE_QUERY_TIMESTAMP_DISJOINT:
110    case PIPE_QUERY_GPU_FINISHED:
111       break;
112    case SI_QUERY_DRAW_CALLS:
113       query->begin_result = sctx->num_draw_calls;
114       break;
115    case SI_QUERY_DECOMPRESS_CALLS:
116       query->begin_result = sctx->num_decompress_calls;
117       break;
118    case SI_QUERY_COMPUTE_CALLS:
119       query->begin_result = sctx->num_compute_calls;
120       break;
121    case SI_QUERY_CP_DMA_CALLS:
122       query->begin_result = sctx->num_cp_dma_calls;
123       break;
124    case SI_QUERY_NUM_VS_FLUSHES:
125       query->begin_result = sctx->num_vs_flushes;
126       break;
127    case SI_QUERY_NUM_PS_FLUSHES:
128       query->begin_result = sctx->num_ps_flushes;
129       break;
130    case SI_QUERY_NUM_CS_FLUSHES:
131       query->begin_result = sctx->num_cs_flushes;
132       break;
133    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
134       query->begin_result = sctx->num_cb_cache_flushes;
135       break;
136    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
137       query->begin_result = sctx->num_db_cache_flushes;
138       break;
139    case SI_QUERY_NUM_L2_INVALIDATES:
140       query->begin_result = sctx->num_L2_invalidates;
141       break;
142    case SI_QUERY_NUM_L2_WRITEBACKS:
143       query->begin_result = sctx->num_L2_writebacks;
144       break;
145    case SI_QUERY_NUM_RESIDENT_HANDLES:
146       query->begin_result = sctx->num_resident_handles;
147       break;
148    case SI_QUERY_TC_OFFLOADED_SLOTS:
149       query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
150       break;
151    case SI_QUERY_TC_DIRECT_SLOTS:
152       query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
153       break;
154    case SI_QUERY_TC_NUM_SYNCS:
155       query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
156       break;
157    case SI_QUERY_REQUESTED_VRAM:
158    case SI_QUERY_REQUESTED_GTT:
159    case SI_QUERY_MAPPED_VRAM:
160    case SI_QUERY_MAPPED_GTT:
161    case SI_QUERY_SLAB_WASTED_VRAM:
162    case SI_QUERY_SLAB_WASTED_GTT:
163    case SI_QUERY_VRAM_USAGE:
164    case SI_QUERY_VRAM_VIS_USAGE:
165    case SI_QUERY_GTT_USAGE:
166    case SI_QUERY_GPU_TEMPERATURE:
167    case SI_QUERY_CURRENT_GPU_SCLK:
168    case SI_QUERY_CURRENT_GPU_MCLK:
169    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
170    case SI_QUERY_NUM_MAPPED_BUFFERS:
171       query->begin_result = 0;
172       break;
173    case SI_QUERY_BUFFER_WAIT_TIME:
174    case SI_QUERY_GFX_IB_SIZE:
175    case SI_QUERY_NUM_GFX_IBS:
176    case SI_QUERY_NUM_BYTES_MOVED:
177    case SI_QUERY_NUM_EVICTIONS:
178    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
179       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
180       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
181       break;
182    }
183    case SI_QUERY_GFX_BO_LIST_SIZE:
184       ws_id = winsys_id_from_type(query->b.type);
185       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
186       query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
187       break;
188    case SI_QUERY_CS_THREAD_BUSY:
189       ws_id = winsys_id_from_type(query->b.type);
190       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
191       query->begin_time = os_time_get_nano();
192       break;
193    case SI_QUERY_GALLIUM_THREAD_BUSY:
194       query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
195       query->begin_time = os_time_get_nano();
196       break;
197    case SI_QUERY_GPU_LOAD:
198    case SI_QUERY_GPU_SHADERS_BUSY:
199    case SI_QUERY_GPU_TA_BUSY:
200    case SI_QUERY_GPU_GDS_BUSY:
201    case SI_QUERY_GPU_VGT_BUSY:
202    case SI_QUERY_GPU_IA_BUSY:
203    case SI_QUERY_GPU_SX_BUSY:
204    case SI_QUERY_GPU_WD_BUSY:
205    case SI_QUERY_GPU_BCI_BUSY:
206    case SI_QUERY_GPU_SC_BUSY:
207    case SI_QUERY_GPU_PA_BUSY:
208    case SI_QUERY_GPU_DB_BUSY:
209    case SI_QUERY_GPU_CP_BUSY:
210    case SI_QUERY_GPU_CB_BUSY:
211    case SI_QUERY_GPU_SDMA_BUSY:
212    case SI_QUERY_GPU_PFP_BUSY:
213    case SI_QUERY_GPU_MEQ_BUSY:
214    case SI_QUERY_GPU_ME_BUSY:
215    case SI_QUERY_GPU_SURF_SYNC_BUSY:
216    case SI_QUERY_GPU_CP_DMA_BUSY:
217    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
218       query->begin_result = si_begin_counter(sctx->screen, query->b.type);
219       break;
220    case SI_QUERY_NUM_COMPILATIONS:
221       query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
222       break;
223    case SI_QUERY_NUM_SHADERS_CREATED:
224       query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
225       break;
226    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
227       query->begin_result = sctx->screen->live_shader_cache.hits;
228       break;
229    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
230       query->begin_result = sctx->screen->live_shader_cache.misses;
231       break;
232    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
233       query->begin_result = sctx->screen->num_memory_shader_cache_hits;
234       break;
235    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
236       query->begin_result = sctx->screen->num_memory_shader_cache_misses;
237       break;
238    case SI_QUERY_DISK_SHADER_CACHE_HITS:
239       query->begin_result = sctx->screen->num_disk_shader_cache_hits;
240       break;
241    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
242       query->begin_result = sctx->screen->num_disk_shader_cache_misses;
243       break;
244    case SI_QUERY_GPIN_ASIC_ID:
245    case SI_QUERY_GPIN_NUM_SIMD:
246    case SI_QUERY_GPIN_NUM_RB:
247    case SI_QUERY_GPIN_NUM_SPI:
248    case SI_QUERY_GPIN_NUM_SE:
249       break;
250    default:
251       unreachable("si_query_sw_begin: bad query type");
252    }
253 
254    return true;
255 }
256 
257 static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
258 {
259    struct si_query_sw *query = (struct si_query_sw *)squery;
260    enum radeon_value_id ws_id;
261 
262    switch (query->b.type) {
263    case PIPE_QUERY_TIMESTAMP_DISJOINT:
264       break;
265    case PIPE_QUERY_GPU_FINISHED:
266       sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
267       break;
268    case SI_QUERY_DRAW_CALLS:
269       query->end_result = sctx->num_draw_calls;
270       break;
271    case SI_QUERY_DECOMPRESS_CALLS:
272       query->end_result = sctx->num_decompress_calls;
273       break;
274    case SI_QUERY_COMPUTE_CALLS:
275       query->end_result = sctx->num_compute_calls;
276       break;
277    case SI_QUERY_CP_DMA_CALLS:
278       query->end_result = sctx->num_cp_dma_calls;
279       break;
280    case SI_QUERY_NUM_VS_FLUSHES:
281       query->end_result = sctx->num_vs_flushes;
282       break;
283    case SI_QUERY_NUM_PS_FLUSHES:
284       query->end_result = sctx->num_ps_flushes;
285       break;
286    case SI_QUERY_NUM_CS_FLUSHES:
287       query->end_result = sctx->num_cs_flushes;
288       break;
289    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
290       query->end_result = sctx->num_cb_cache_flushes;
291       break;
292    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
293       query->end_result = sctx->num_db_cache_flushes;
294       break;
295    case SI_QUERY_NUM_L2_INVALIDATES:
296       query->end_result = sctx->num_L2_invalidates;
297       break;
298    case SI_QUERY_NUM_L2_WRITEBACKS:
299       query->end_result = sctx->num_L2_writebacks;
300       break;
301    case SI_QUERY_NUM_RESIDENT_HANDLES:
302       query->end_result = sctx->num_resident_handles;
303       break;
304    case SI_QUERY_TC_OFFLOADED_SLOTS:
305       query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
306       break;
307    case SI_QUERY_TC_DIRECT_SLOTS:
308       query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
309       break;
310    case SI_QUERY_TC_NUM_SYNCS:
311       query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
312       break;
313    case SI_QUERY_REQUESTED_VRAM:
314    case SI_QUERY_REQUESTED_GTT:
315    case SI_QUERY_MAPPED_VRAM:
316    case SI_QUERY_MAPPED_GTT:
317    case SI_QUERY_SLAB_WASTED_VRAM:
318    case SI_QUERY_SLAB_WASTED_GTT:
319    case SI_QUERY_VRAM_USAGE:
320    case SI_QUERY_VRAM_VIS_USAGE:
321    case SI_QUERY_GTT_USAGE:
322    case SI_QUERY_GPU_TEMPERATURE:
323    case SI_QUERY_CURRENT_GPU_SCLK:
324    case SI_QUERY_CURRENT_GPU_MCLK:
325    case SI_QUERY_BUFFER_WAIT_TIME:
326    case SI_QUERY_GFX_IB_SIZE:
327    case SI_QUERY_NUM_MAPPED_BUFFERS:
328    case SI_QUERY_NUM_GFX_IBS:
329    case SI_QUERY_NUM_BYTES_MOVED:
330    case SI_QUERY_NUM_EVICTIONS:
331    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
332       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
333       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
334       break;
335    }
336    case SI_QUERY_GFX_BO_LIST_SIZE:
337       ws_id = winsys_id_from_type(query->b.type);
338       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
339       query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
340       break;
341    case SI_QUERY_CS_THREAD_BUSY:
342       ws_id = winsys_id_from_type(query->b.type);
343       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
344       query->end_time = os_time_get_nano();
345       break;
346    case SI_QUERY_GALLIUM_THREAD_BUSY:
347       query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
348       query->end_time = os_time_get_nano();
349       break;
350    case SI_QUERY_GPU_LOAD:
351    case SI_QUERY_GPU_SHADERS_BUSY:
352    case SI_QUERY_GPU_TA_BUSY:
353    case SI_QUERY_GPU_GDS_BUSY:
354    case SI_QUERY_GPU_VGT_BUSY:
355    case SI_QUERY_GPU_IA_BUSY:
356    case SI_QUERY_GPU_SX_BUSY:
357    case SI_QUERY_GPU_WD_BUSY:
358    case SI_QUERY_GPU_BCI_BUSY:
359    case SI_QUERY_GPU_SC_BUSY:
360    case SI_QUERY_GPU_PA_BUSY:
361    case SI_QUERY_GPU_DB_BUSY:
362    case SI_QUERY_GPU_CP_BUSY:
363    case SI_QUERY_GPU_CB_BUSY:
364    case SI_QUERY_GPU_SDMA_BUSY:
365    case SI_QUERY_GPU_PFP_BUSY:
366    case SI_QUERY_GPU_MEQ_BUSY:
367    case SI_QUERY_GPU_ME_BUSY:
368    case SI_QUERY_GPU_SURF_SYNC_BUSY:
369    case SI_QUERY_GPU_CP_DMA_BUSY:
370    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
371       query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
372       query->begin_result = 0;
373       break;
374    case SI_QUERY_NUM_COMPILATIONS:
375       query->end_result = p_atomic_read(&sctx->screen->num_compilations);
376       break;
377    case SI_QUERY_NUM_SHADERS_CREATED:
378       query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
379       break;
380    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
381       query->end_result = sctx->last_tex_ps_draw_ratio;
382       break;
383    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
384       query->end_result = sctx->screen->live_shader_cache.hits;
385       break;
386    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
387       query->end_result = sctx->screen->live_shader_cache.misses;
388       break;
389    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
390       query->end_result = sctx->screen->num_memory_shader_cache_hits;
391       break;
392    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
393       query->end_result = sctx->screen->num_memory_shader_cache_misses;
394       break;
395    case SI_QUERY_DISK_SHADER_CACHE_HITS:
396       query->end_result = sctx->screen->num_disk_shader_cache_hits;
397       break;
398    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
399       query->end_result = sctx->screen->num_disk_shader_cache_misses;
400       break;
401    case SI_QUERY_GPIN_ASIC_ID:
402    case SI_QUERY_GPIN_NUM_SIMD:
403    case SI_QUERY_GPIN_NUM_RB:
404    case SI_QUERY_GPIN_NUM_SPI:
405    case SI_QUERY_GPIN_NUM_SE:
406       break;
407    default:
408       unreachable("si_query_sw_end: bad query type");
409    }
410 
411    return true;
412 }
413 
414 static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
415                                    union pipe_query_result *result)
416 {
417    struct si_query_sw *query = (struct si_query_sw *)squery;
418 
419    switch (query->b.type) {
420    case PIPE_QUERY_TIMESTAMP_DISJOINT:
421       /* Convert from cycles per millisecond to cycles per second (Hz). */
422       result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
423       result->timestamp_disjoint.disjoint = false;
424       return true;
425    case PIPE_QUERY_GPU_FINISHED: {
426       struct pipe_screen *screen = sctx->b.screen;
427       struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
428 
429       result->b = screen->fence_finish(screen, ctx, query->fence, wait ? OS_TIMEOUT_INFINITE : 0);
430       return result->b;
431    }
432 
433    case SI_QUERY_GFX_BO_LIST_SIZE:
434       result->u64 =
435          (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
436       return true;
437    case SI_QUERY_CS_THREAD_BUSY:
438    case SI_QUERY_GALLIUM_THREAD_BUSY:
439       result->u64 =
440          (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
441       return true;
442    case SI_QUERY_GPIN_ASIC_ID:
443       result->u32 = 0;
444       return true;
445    case SI_QUERY_GPIN_NUM_SIMD:
446       result->u32 = sctx->screen->info.num_cu;
447       return true;
448    case SI_QUERY_GPIN_NUM_RB:
449       result->u32 = sctx->screen->info.max_render_backends;
450       return true;
451    case SI_QUERY_GPIN_NUM_SPI:
452       result->u32 = 1; /* all supported chips have one SPI per SE */
453       return true;
454    case SI_QUERY_GPIN_NUM_SE:
455       result->u32 = sctx->screen->info.max_se;
456       return true;
457    }
458 
459    result->u64 = query->end_result - query->begin_result;
460 
461    switch (query->b.type) {
462    case SI_QUERY_BUFFER_WAIT_TIME:
463    case SI_QUERY_GPU_TEMPERATURE:
464       result->u64 /= 1000;
465       break;
466    case SI_QUERY_CURRENT_GPU_SCLK:
467    case SI_QUERY_CURRENT_GPU_MCLK:
468       result->u64 *= 1000000;
469       break;
470    }
471 
472    return true;
473 }
474 
475 static struct pipe_query *si_query_sw_create(unsigned query_type)
476 {
477    struct si_query_sw *query;
478 
479    query = CALLOC_STRUCT(si_query_sw);
480    if (!query)
481       return NULL;
482 
483    query->b.type = query_type;
484    query->b.ops = &sw_query_ops;
485 
486    return (struct pipe_query *)query;
487 }
488 
489 void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
490 {
491    struct si_query_buffer *prev = buffer->previous;
492 
493    /* Release all query buffers. */
494    while (prev) {
495       struct si_query_buffer *qbuf = prev;
496       prev = prev->previous;
497       si_resource_reference(&qbuf->buf, NULL);
498       FREE(qbuf);
499    }
500 
501    si_resource_reference(&buffer->buf, NULL);
502 }
503 
504 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
505 {
506    /* Discard all query buffers except for the oldest. */
507    while (buffer->previous) {
508       struct si_query_buffer *qbuf = buffer->previous;
509       buffer->previous = qbuf->previous;
510 
511       si_resource_reference(&buffer->buf, NULL);
512       buffer->buf = qbuf->buf; /* move ownership */
513       FREE(qbuf);
514    }
515    buffer->results_end = 0;
516 
517    if (!buffer->buf)
518       return;
519 
520    /* Discard even the oldest buffer if it can't be mapped without a stall. */
521    if (si_cs_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
522        !sctx->ws->buffer_wait(sctx->ws, buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
523       si_resource_reference(&buffer->buf, NULL);
524    } else {
525       buffer->unprepared = true;
526    }
527 }
528 
529 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
530                            bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
531                            unsigned size)
532 {
533    bool unprepared = buffer->unprepared;
534    buffer->unprepared = false;
535 
536    if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
537       if (buffer->buf) {
538          struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
539          memcpy(qbuf, buffer, sizeof(*qbuf));
540          buffer->previous = qbuf;
541       }
542       buffer->results_end = 0;
543 
544       /* Queries are normally read by the CPU after
545        * being written by the gpu, hence staging is probably a good
546        * usage pattern.
547        */
548       struct si_screen *screen = sctx->screen;
549       unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
550       buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
551       if (unlikely(!buffer->buf))
552          return false;
553       unprepared = true;
554    }
555 
556    if (unprepared && prepare_buffer) {
557       if (unlikely(!prepare_buffer(sctx, buffer))) {
558          si_resource_reference(&buffer->buf, NULL);
559          return false;
560       }
561    }
562 
563    return true;
564 }
565 
566 static void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
567 {
568    struct si_query_hw *query = (struct si_query_hw *)squery;
569 
570    si_query_buffer_destroy(sctx->screen, &query->buffer);
571    si_resource_reference(&query->workaround_buf, NULL);
572    FREE(squery);
573 }
574 
575 static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
576 {
577    struct si_query_hw *query = container_of(qbuf, struct si_query_hw, buffer);
578    struct si_screen *screen = sctx->screen;
579 
580    /* The caller ensures that the buffer is currently unused by the GPU. */
581    uint32_t *results = screen->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
582                                               PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
583    if (!results)
584       return false;
585 
586    memset(results, 0, qbuf->buf->b.b.width0);
587 
588    if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
589        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
590        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
591       unsigned max_rbs = screen->info.max_render_backends;
592       uint64_t enabled_rb_mask = screen->info.enabled_rb_mask;
593       unsigned num_results;
594       unsigned i, j;
595 
596       /* Set top bits for unused backends. */
597       num_results = qbuf->buf->b.b.width0 / query->result_size;
598       for (j = 0; j < num_results; j++) {
599          for (i = 0; i < max_rbs; i++) {
600             if (!(enabled_rb_mask & (1ull << i))) {
601                results[(i * 4) + 1] = 0x80000000;
602                results[(i * 4) + 3] = 0x80000000;
603             }
604          }
605          results += 4 * max_rbs;
606       }
607    }
608 
609    return true;
610 }
611 
612 static unsigned si_query_pipestats_num_results(struct si_screen *sscreen)
613 {
614    return sscreen->info.gfx_level >= GFX11 ? 14 : 11;
615 }
616 
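/* Dword offset of each pipeline statistic inside one SAMPLE_PIPELINESTAT block.
 * Every counter is a 64-bit value, hence the stride of 2 dwords.
 */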
617 static unsigned si_query_pipestat_dw_offset(enum pipe_statistics_query_index index)
618 {
619    switch (index) {
620    case PIPE_STAT_QUERY_PS_INVOCATIONS: return 0;
621    case PIPE_STAT_QUERY_C_PRIMITIVES: return 2;
622    case PIPE_STAT_QUERY_C_INVOCATIONS: return 4;
623    case PIPE_STAT_QUERY_VS_INVOCATIONS: return 6;
624    case PIPE_STAT_QUERY_GS_INVOCATIONS: return 8;
625    case PIPE_STAT_QUERY_GS_PRIMITIVES: return 10;
626    case PIPE_STAT_QUERY_IA_PRIMITIVES: return 12;
627    case PIPE_STAT_QUERY_IA_VERTICES: return 14;
628    case PIPE_STAT_QUERY_HS_INVOCATIONS: return 16;
629    case PIPE_STAT_QUERY_DS_INVOCATIONS: return 18;
630    case PIPE_STAT_QUERY_CS_INVOCATIONS: return 20;
631    /* gfx11: MS_INVOCATIONS */
632    /* gfx11: MS_PRIMITIVES */
633    /* gfx11: TS_INVOCATIONS */
634    default:
635       assert(false);
636    }
637    return ~0;
638 }
639 
640 unsigned si_query_pipestat_end_dw_offset(struct si_screen *sscreen,
641                                          enum pipe_statistics_query_index index)
642 {
643    return si_query_pipestats_num_results(sscreen) * 2 + si_query_pipestat_dw_offset(index);
644 }
645 
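/* Allocate a hardware query and set the per-type result size, flags, and the
 * command-stream space needed to suspend/resume it.
 */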
646 static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
647                                              unsigned index)
648 {
649    struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
650    if (!query)
651       return NULL;
652 
653    query->b.type = query_type;
654    query->b.ops = &query_hw_ops;
655    query->ops = &query_hw_default_hw_ops;
656 
657    switch (query_type) {
658    case PIPE_QUERY_OCCLUSION_COUNTER:
659    case PIPE_QUERY_OCCLUSION_PREDICATE:
660    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
661       query->result_size = 16 * sscreen->info.max_render_backends;
662       query->result_size += 16; /* for the fence + alignment */
663       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
664       break;
665    case PIPE_QUERY_TIME_ELAPSED:
666       query->result_size = 24;
667       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
668       break;
669    case PIPE_QUERY_TIMESTAMP:
670       query->result_size = 16;
671       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
672       query->flags = SI_QUERY_HW_FLAG_NO_START;
673       break;
674    case PIPE_QUERY_PRIMITIVES_EMITTED:
675    case PIPE_QUERY_PRIMITIVES_GENERATED:
676    case PIPE_QUERY_SO_STATISTICS:
677    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
678       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
679       query->result_size = 32;
680       query->b.num_cs_dw_suspend = 6;
681       query->stream = index;
682       break;
683    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
684       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
685       query->result_size = 32 * SI_MAX_STREAMS;
686       query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
687       break;
688    case PIPE_QUERY_PIPELINE_STATISTICS:
689       query->result_size = si_query_pipestats_num_results(sscreen) * 16;
690       query->result_size += 8; /* for the fence + alignment */
691       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
692       query->index = index;
693       if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) &&
694           sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3))
695          query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
696 
697       /* GFX11 only emulates PIPE_STAT_QUERY_GS_PRIMITIVES because the shader culls,
698        * which makes the statistic incorrect.
699        */
700       if (sscreen->info.gfx_level >= GFX11 && index == PIPE_STAT_QUERY_GS_PRIMITIVES)
701          query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
702       break;
703    default:
704       assert(0);
705       FREE(query);
706       return NULL;
707    }
708 
709    return (struct pipe_query *)query;
710 }
711 
712 static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
713 {
714    if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
715        type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
716       switch (type) {
717       case PIPE_QUERY_OCCLUSION_COUNTER:
718          sctx->num_integer_occlusion_queries += diff;
719          break;
720       case PIPE_QUERY_OCCLUSION_PREDICATE:
721          sctx->num_boolean_occlusion_queries += diff;
722          break;
723       case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
724          sctx->num_conservative_occlusion_queries += diff;
725          break;
726       }
727 
728       assert(sctx->num_integer_occlusion_queries >= 0);
729       assert(sctx->num_boolean_occlusion_queries >= 0);
730       assert(sctx->num_conservative_occlusion_queries >= 0);
731 
732       enum si_occlusion_query_mode new_mode =
733          sctx->num_integer_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER :
734          sctx->num_boolean_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN :
735          sctx->num_conservative_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN :
736          SI_OCCLUSION_QUERY_MODE_DISABLE;
737 
738       /* Conservative queries are only available on gfx10+. On gfx11+, they perform worse
739        * with late Z, but not early Z. Instead of trying to detect late Z, never enable
740        * conservative queries to keep it simple. This is the recommended programming.
741        */
742       if (new_mode == SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN &&
743           (sctx->gfx_level < GFX10 || sctx->gfx_level >= GFX11))
744          new_mode = SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN;
745 
746       if (sctx->occlusion_query_mode != new_mode) {
747          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
748 
749          if (sctx->screen->info.has_out_of_order_rast &&
750              (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER) !=
751              (new_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER))
752             si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
753 
754          sctx->occlusion_query_mode = new_mode;
755       }
756    }
757 }
758 
759 static unsigned event_type_for_stream(unsigned stream)
760 {
761    switch (stream) {
762    default:
763    case 0:
764       return V_028A90_SAMPLE_STREAMOUTSTATS;
765    case 1:
766       return V_028A90_SAMPLE_STREAMOUTSTATS1;
767    case 2:
768       return V_028A90_SAMPLE_STREAMOUTSTATS2;
769    case 3:
770       return V_028A90_SAMPLE_STREAMOUTSTATS3;
771    }
772 }
773 
774 static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
775 {
776    radeon_begin(cs);
777    radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
778    radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
779    radeon_emit(va);
780    radeon_emit(va >> 32);
781    radeon_end();
782 }
783 
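/* Emit the packets that write the "begin" values of a query at the given buffer address. */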
784 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
785                                       struct si_resource *buffer, uint64_t va)
786 {
787    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
788 
789    switch (query->b.type) {
790    case PIPE_QUERY_OCCLUSION_COUNTER:
791    case PIPE_QUERY_OCCLUSION_PREDICATE:
792    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
793       radeon_begin(cs);
794       if (sctx->gfx_level >= GFX11 &&
795           sctx->screen->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
796          radeon_emit(PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
797       } else {
798          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
799          if (sctx->gfx_level >= GFX11)
800             radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
801          else
802             radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
803       }
804       radeon_emit(va);
805       radeon_emit(va >> 32);
806       radeon_end();
807       break;
808    }
809    case PIPE_QUERY_PRIMITIVES_EMITTED:
810    case PIPE_QUERY_PRIMITIVES_GENERATED:
811    case PIPE_QUERY_SO_STATISTICS:
812    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
813       emit_sample_streamout(cs, va, query->stream);
814       break;
815    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
816       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
817          emit_sample_streamout(cs, va + 32 * stream, stream);
818       break;
819    case PIPE_QUERY_TIME_ELAPSED:
820       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
821                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
822       break;
823    case PIPE_QUERY_PIPELINE_STATISTICS: {
824       if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
825          /* The hw GS primitive counter doesn't work when ngg is active.
826           * So if use_ngg is true, we don't use the hw version but instead
827           * emulate it in the GS shader.
828           * The value is written at the same position, so we don't need to
829           * change anything else.
830           * If ngg is enabled for the draw, the primitive count is written in
831           * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported
832           * vertices is stored in gs_emitted_vertices and the number of prim
833           * is computed based on the output prim type in emit_gs_epilogue.
834           */
835          struct pipe_shader_buffer sbuf;
836          sbuf.buffer = &buffer->b.b;
837          sbuf.buffer_offset = query->buffer.results_end;
838          sbuf.buffer_size = buffer->bo_size;
839          si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
840          SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 1);
841 
842          const uint32_t zero = 0;
843          radeon_begin(cs);
844          /* Clear the emulated counter end value. We don't clear start because it's unused. */
845          va += si_query_pipestat_end_dw_offset(sctx->screen, query->index) * 4;
846          radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
847          radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
848          radeon_emit(va);
849          radeon_emit(va >> 32);
850          radeon_emit(zero);
851          radeon_end();
852 
853          sctx->num_pipeline_stat_emulated_queries++;
854       } else {
855          radeon_begin(cs);
856          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
857          radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
858          radeon_emit(va);
859          radeon_emit(va >> 32);
860          radeon_end();
861       }
862       break;
863    }
864    default:
865       assert(0);
866    }
867    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
868                              RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
869 }
870 
871 static void si_update_hw_pipeline_stats(struct si_context *sctx, unsigned type, int diff)
872 {
873    if (type == PIPE_QUERY_PIPELINE_STATISTICS ||
874        /* All streamout queries: */
875        type == PIPE_QUERY_PRIMITIVES_GENERATED ||
876        type == PIPE_QUERY_PRIMITIVES_EMITTED ||
877        type == PIPE_QUERY_SO_STATISTICS ||
878        type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
879        type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
880       if (type == PIPE_QUERY_PIPELINE_STATISTICS)
881          sctx->num_pipeline_stat_queries += diff;
882 
883       /* Increment for pipeline statistics and streamout queries. */
884       sctx->num_hw_pipestat_streamout_queries += diff;
885 
886       /* Enable/disable pipeline stats if we have any queries. */
887       if (diff == 1 && sctx->num_hw_pipestat_streamout_queries == 1) {
888          sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
889          sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
890          si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
891       } else if (diff == -1 && sctx->num_hw_pipestat_streamout_queries == 0) {
892          sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
893          sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
894          si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
895       }
896    }
897 }
898 
899 static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
900 {
901    uint64_t va;
902 
903    if (!query->buffer.buf && query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
904       si_resource_reference(&query->buffer.buf, sctx->pipeline_stats_query_buf);
905 
906    /* Don't realloc pipeline_stats_query_buf */
907    if ((!(query->flags & SI_QUERY_EMULATE_GS_COUNTERS) || !sctx->pipeline_stats_query_buf) &&
908        !si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
909       return;
910 
911    if (query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
912       si_resource_reference(&sctx->pipeline_stats_query_buf, query->buffer.buf);
913 
914    si_update_occlusion_query_state(sctx, query->b.type, 1);
915    si_update_prims_generated_query_state(sctx, query->b.type, 1);
916    si_update_hw_pipeline_stats(sctx, query->b.type, 1);
917 
918    si_need_gfx_cs_space(sctx, 0);
919 
920    va = query->buffer.buf->gpu_address + query->buffer.results_end;
921    query->ops->emit_start(sctx, query, query->buffer.buf, va);
922 }
923 
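/* Emit the packets that write the "end" values and, where applicable, a fence dword
 * that marks the result slot as complete.
 */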
924 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
925                                      struct si_resource *buffer, uint64_t va)
926 {
927    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
928    uint64_t fence_va = 0;
929 
930    switch (query->b.type) {
931    case PIPE_QUERY_OCCLUSION_COUNTER:
932    case PIPE_QUERY_OCCLUSION_PREDICATE:
933    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
934       va += 8;
935       radeon_begin(cs);
936       if (sctx->gfx_level >= GFX11 &&
937           sctx->screen->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
938          radeon_emit(PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
939       } else {
940          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
941          if (sctx->gfx_level >= GFX11)
942             radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
943          else
944             radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
945       }
946       radeon_emit(va);
947       radeon_emit(va >> 32);
948       radeon_end();
949 
950       fence_va = va + sctx->screen->info.max_render_backends * 16 - 8;
951       break;
952    }
953    case PIPE_QUERY_PRIMITIVES_EMITTED:
954    case PIPE_QUERY_PRIMITIVES_GENERATED:
955    case PIPE_QUERY_SO_STATISTICS:
956    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
957       va += 16;
958       emit_sample_streamout(cs, va, query->stream);
959       break;
960    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
961       va += 16;
962       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
963          emit_sample_streamout(cs, va + 32 * stream, stream);
964       break;
965    case PIPE_QUERY_TIME_ELAPSED:
966       va += 8;
967       FALLTHROUGH;
968    case PIPE_QUERY_TIMESTAMP:
969       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
970                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
971       fence_va = va + 8;
972       break;
973    case PIPE_QUERY_PIPELINE_STATISTICS: {
974       unsigned sample_size = (query->result_size - 8) / 2;
975 
976       va += sample_size;
977 
978       radeon_begin(cs);
979       if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
980          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
981          radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
982 
983          if (--sctx->num_pipeline_stat_emulated_queries == 0) {
984             si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
985             SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 0);
986          }
987       } else {
988          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
989          radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
990          radeon_emit(va);
991          radeon_emit(va >> 32);
992       }
993       radeon_end();
994 
995       fence_va = va + sample_size;
996       break;
997    }
998    default:
999       assert(0);
1000    }
1001    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
1002                              RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
1003 
1004    if (fence_va) {
1005       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
1006                         EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
1007                         query->b.type);
1008    }
1009 }
1010 
1011 static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
1012 {
1013    uint64_t va;
1014 
1015    /* The queries which need begin already called this in begin_query. */
1016    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1017       si_need_gfx_cs_space(sctx, 0);
1018       if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
1019                                  query->result_size))
1020          return;
1021    }
1022 
1023    if (!query->buffer.buf)
1024       return; // previous buffer allocation failure
1025 
1026    /* emit end query */
1027    va = query->buffer.buf->gpu_address + query->buffer.results_end;
1028 
1029    query->ops->emit_stop(sctx, query, query->buffer.buf, va);
1030 
1031    query->buffer.results_end += query->result_size;
1032 
1033    si_update_occlusion_query_state(sctx, query->b.type, -1);
1034    si_update_prims_generated_query_state(sctx, query->b.type, -1);
1035    si_update_hw_pipeline_stats(sctx, query->b.type, -1);
1036 }
1037 
1038 static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
1039                                uint32_t op)
1040 {
1041    struct radeon_cmdbuf *cs = &ctx->gfx_cs;
1042 
1043    radeon_begin(cs);
1044 
1045    if (ctx->gfx_level >= GFX9) {
1046       radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
1047       radeon_emit(op);
1048       radeon_emit(va);
1049       radeon_emit(va >> 32);
1050    } else {
1051       radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
1052       radeon_emit(va);
1053       radeon_emit(op | ((va >> 32) & 0xFF));
1054    }
1055    radeon_end();
1056 
1057    radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ | RADEON_PRIO_QUERY);
1058 }
1059 
1060 static void si_emit_query_predication(struct si_context *ctx, unsigned index)
1061 {
1062    uint32_t op;
1063    bool flag_wait, invert;
1064 
1065    struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
1066    if (!query)
1067       return;
1068 
1069    invert = ctx->render_cond_invert;
1070    flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
1071                ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
1072 
1073    if (ctx->gfx_level >= GFX11 && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1074                                    query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
1075       struct gfx11_sh_query *gfx10_query = (struct gfx11_sh_query *)query;
1076       struct gfx11_sh_query_buffer *qbuf, *first, *last;
1077 
1078       op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1079 
1080       /* if true then invert, see GL_ARB_conditional_render_inverted */
1081       if (!invert)
1082          op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1083       else
1084          op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1085 
1086       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1087 
1088       first = gfx10_query->first;
1089       last = gfx10_query->last;
1090 
1091       while (first) {
1092          qbuf = first;
1093          if (first != last)
1094             first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
1095          else
1096             first = NULL;
1097 
1098          unsigned results_base = gfx10_query->first_begin;
1099          uint64_t va_base = qbuf->buf->gpu_address;
1100          uint64_t va = va_base + results_base;
1101 
1102          unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
1103          unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;
1104 
1105          unsigned count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem);
1106          do {
1107             if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1108                for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1109                   emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);
1110 
1111                   /* set CONTINUE bit for all packets except the first */
1112                   op |= PREDICATION_CONTINUE;
1113                }
1114             } else {
1115                emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
1116                op |= PREDICATION_CONTINUE;
1117             }
1118 
1119             results_base += sizeof(struct gfx11_sh_query_buffer_mem);
1120          } while (count--);
1121       }
1122    } else {
1123       struct si_query_buffer *qbuf;
1124 
1125       if (query->workaround_buf) {
1126          op = PRED_OP(PREDICATION_OP_BOOL64);
1127       } else {
1128          switch (query->b.type) {
1129          case PIPE_QUERY_OCCLUSION_COUNTER:
1130          case PIPE_QUERY_OCCLUSION_PREDICATE:
1131          case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1132             op = PRED_OP(PREDICATION_OP_ZPASS);
1133             break;
1134          case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1135          case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1136             op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1137             invert = !invert;
1138             break;
1139          default:
1140             assert(0);
1141             return;
1142          }
1143       }
1144 
1145       /* if true then invert, see GL_ARB_conditional_render_inverted */
1146       if (invert)
1147          op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1148       else
1149          op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1150 
1151       /* Use the value written by compute shader as a workaround. Note that
1152        * the wait flag does not apply in this predication mode.
1153        *
1154        * The shader outputs the result value to L2. Workarounds only affect GFX8
1155        * and later, where the CP reads data from L2, so we don't need an
1156        * additional flush.
1157        */
1158       if (query->workaround_buf) {
1159          uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
1160          emit_set_predicate(ctx, query->workaround_buf, va, op);
1161          return;
1162       }
1163 
1164       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1165 
1166       /* emit predicate packets for all data blocks */
1167       for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1168          unsigned results_base = 0;
1169          uint64_t va_base = qbuf->buf->gpu_address;
1170 
1171          while (results_base < qbuf->results_end) {
1172             uint64_t va = va_base + results_base;
1173 
1174             if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1175                for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1176                   emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1177 
1178                   /* set CONTINUE bit for all packets except the first */
1179                   op |= PREDICATION_CONTINUE;
1180                }
1181             } else {
1182                emit_set_predicate(ctx, qbuf->buf, va, op);
1183                op |= PREDICATION_CONTINUE;
1184             }
1185 
1186             results_base += query->result_size;
1187          }
1188       }
1189    }
1190 }
1191 
1192 static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
1193                                           unsigned index)
1194 {
1195    struct si_screen *sscreen = (struct si_screen *)ctx->screen;
1196 
1197    if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
1198        (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
1199       return si_query_sw_create(query_type);
1200 
1201    if (sscreen->info.gfx_level >= GFX11 &&
1202        (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
1203         query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
1204         query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1205         query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
1206       return gfx11_sh_query_create(sscreen, query_type, index);
1207 
1208    return si_query_hw_create(sscreen, query_type, index);
1209 }
1210 
1211 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1212 {
1213    struct si_context *sctx = (struct si_context *)ctx;
1214    struct si_query *squery = (struct si_query *)query;
1215 
1216    squery->ops->destroy(sctx, squery);
1217 }
1218 
1219 static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
1220 {
1221    struct si_context *sctx = (struct si_context *)ctx;
1222    struct si_query *squery = (struct si_query *)query;
1223 
1224    return squery->ops->begin(sctx, squery);
1225 }
1226 
1227 static bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
1228 {
1229    struct si_query_hw *query = (struct si_query_hw *)squery;
1230 
1231    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1232       assert(0);
1233       return false;
1234    }
1235 
1236    if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
1237       si_query_buffer_reset(sctx, &query->buffer);
1238 
1239    si_resource_reference(&query->workaround_buf, NULL);
1240 
1241    si_query_hw_emit_start(sctx, query);
1242    if (!query->buffer.buf)
1243       return false;
1244 
1245    list_addtail(&query->b.active_list, &sctx->active_queries);
1246    sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1247    return true;
1248 }
1249 
1250 static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
1251 {
1252    struct si_context *sctx = (struct si_context *)ctx;
1253    struct si_query *squery = (struct si_query *)query;
1254 
1255    return squery->ops->end(sctx, squery);
1256 }
1257 
1258 static bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
1259 {
1260    struct si_query_hw *query = (struct si_query_hw *)squery;
1261 
1262    if (query->flags & SI_QUERY_HW_FLAG_NO_START)
1263       si_query_buffer_reset(sctx, &query->buffer);
1264 
1265    si_query_hw_emit_stop(sctx, query);
1266 
1267    if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
1268       list_delinit(&query->b.active_list);
1269       sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
1270    }
1271 
1272    if (!query->buffer.buf)
1273       return false;
1274 
1275    return true;
1276 }
1277 
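/* Describe where the begin/end values and the fence live within one result slot
 * of a hardware query.
 */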
1278 static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
1279                                    struct si_hw_query_params *params)
1280 {
1281    unsigned max_rbs = sctx->screen->info.max_render_backends;
1282 
1283    params->pair_stride = 0;
1284    params->pair_count = 1;
1285 
1286    switch (squery->b.type) {
1287    case PIPE_QUERY_OCCLUSION_COUNTER:
1288    case PIPE_QUERY_OCCLUSION_PREDICATE:
1289    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1290       params->start_offset = 0;
1291       params->end_offset = 8;
1292       params->fence_offset = max_rbs * 16;
1293       params->pair_stride = 16;
1294       params->pair_count = max_rbs;
1295       break;
1296    case PIPE_QUERY_TIME_ELAPSED:
1297       params->start_offset = 0;
1298       params->end_offset = 8;
1299       params->fence_offset = 16;
1300       break;
1301    case PIPE_QUERY_TIMESTAMP:
1302       params->start_offset = 0;
1303       params->end_offset = 0;
1304       params->fence_offset = 8;
1305       break;
1306    case PIPE_QUERY_PRIMITIVES_EMITTED:
1307       params->start_offset = 8;
1308       params->end_offset = 24;
1309       params->fence_offset = params->end_offset + 4;
1310       break;
1311    case PIPE_QUERY_PRIMITIVES_GENERATED:
1312       params->start_offset = 0;
1313       params->end_offset = 16;
1314       params->fence_offset = params->end_offset + 4;
1315       break;
1316    case PIPE_QUERY_SO_STATISTICS:
1317       params->start_offset = 8 - index * 8;
1318       params->end_offset = 24 - index * 8;
1319       params->fence_offset = params->end_offset + 4;
1320       break;
1321    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1322       params->pair_count = SI_MAX_STREAMS;
1323       params->pair_stride = 32;
1324       FALLTHROUGH;
1325    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1326       params->start_offset = 0;
1327       params->end_offset = 16;
1328 
1329       /* We can re-use the high dword of the last 64-bit value as a
1330        * fence: it is initialized as 0, and the high bit is set by
1331        * the write of the streamout stats event.
1332        */
1333       params->fence_offset = squery->result_size - 4;
1334       break;
1335    case PIPE_QUERY_PIPELINE_STATISTICS: {
1336       params->start_offset = si_query_pipestat_dw_offset(index) * 4;
1337       params->end_offset = si_query_pipestat_end_dw_offset(sctx->screen, index) * 4;
1338       params->fence_offset = si_query_pipestats_num_results(sctx->screen) * 16;
1339       break;
1340    }
1341    default:
1342       unreachable("si_get_hw_query_params unsupported");
1343    }
1344 }
1345 
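/* Return end - start for a 64-bit counter pair stored as 32-bit dwords.
 * If test_status_bit is set, the pair only contributes when bit 63 of both
 * values is set, i.e. both halves have been marked valid.
 */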
1346 static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
1347                                      bool test_status_bit)
1348 {
1349    uint32_t *current_result = (uint32_t *)map;
1350    uint64_t start, end;
1351 
1352    start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
1353    end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
1354 
1355    if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1356       return end - start;
1357    }
1358    return 0;
1359 }
1360 
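/* Accumulate the values from one result slot of the mapped query buffer into *result. */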
1361 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1362                                    void *buffer, union pipe_query_result *result)
1363 {
1364    unsigned max_rbs = sscreen->info.max_render_backends;
1365 
1366    switch (query->b.type) {
1367    case PIPE_QUERY_OCCLUSION_COUNTER: {
1368       for (unsigned i = 0; i < max_rbs; ++i) {
1369          unsigned results_base = i * 16;
1370          result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1371       }
1372       break;
1373    }
1374    case PIPE_QUERY_OCCLUSION_PREDICATE:
1375    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1376       for (unsigned i = 0; i < max_rbs; ++i) {
1377          unsigned results_base = i * 16;
1378          result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1379       }
1380       break;
1381    }
1382    case PIPE_QUERY_TIME_ELAPSED:
1383       result->u64 += si_query_read_result(buffer, 0, 2, false);
1384       break;
1385    case PIPE_QUERY_TIMESTAMP:
1386       result->u64 = *(uint64_t *)buffer;
1387       break;
1388    case PIPE_QUERY_PRIMITIVES_EMITTED:
1389       /* SAMPLE_STREAMOUTSTATS stores this structure:
1390        * {
1391        *    u64 NumPrimitivesWritten;
1392        *    u64 PrimitiveStorageNeeded;
1393        * }
1394        * We only need NumPrimitivesWritten here. */
1395       result->u64 += si_query_read_result(buffer, 2, 6, true);
1396       break;
1397    case PIPE_QUERY_PRIMITIVES_GENERATED:
1398       /* Here we read PrimitiveStorageNeeded. */
1399       result->u64 += si_query_read_result(buffer, 0, 4, true);
1400       break;
1401    case PIPE_QUERY_SO_STATISTICS:
1402       result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1403       result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1404       break;
1405    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1406       result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1407                                   si_query_read_result(buffer, 0, 4, true);
1408       break;
1409    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1410       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1411          result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1412                                      si_query_read_result(buffer, 0, 4, true);
1413          buffer = (char *)buffer + 32;
1414       }
1415       break;
1416    case PIPE_QUERY_PIPELINE_STATISTICS:
1417       for (int i = 0; i < 11; i++) {
1418          result->pipeline_statistics.counters[i] +=
1419             si_query_read_result(buffer, si_query_pipestat_dw_offset(i),
1420                                  si_query_pipestat_end_dw_offset(sscreen, i), false);
1421       }
1422 #if 0 /* for testing */
1423       printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1424              "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1425              "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1426              result->pipeline_statistics.ia_vertices,
1427              result->pipeline_statistics.ia_primitives,
1428              result->pipeline_statistics.vs_invocations,
1429              result->pipeline_statistics.hs_invocations,
1430              result->pipeline_statistics.ds_invocations,
1431              result->pipeline_statistics.gs_invocations,
1432              result->pipeline_statistics.gs_primitives,
1433              result->pipeline_statistics.c_invocations,
1434              result->pipeline_statistics.c_primitives,
1435              result->pipeline_statistics.ps_invocations,
1436              result->pipeline_statistics.cs_invocations);
1437 #endif
1438       break;
1439    default:
1440       assert(0);
1441    }
1442 }
1443 
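/* Suspend/resume hooks for the generic query ops: suspending emits the stop
 * event for an active query and resuming emits a new start event, so a query
 * can span command buffer flushes.
 */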
1444 static void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
1445 {
1446    si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
1447 }
1448 
1449 static void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
1450 {
1451    si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
1452 }
1453 
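/* pipe_context entry points; they simply dispatch through the query's ops
 * vtable so that software and hardware queries share one interface.
 */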
1454 static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1455                                 union pipe_query_result *result)
1456 {
1457    struct si_context *sctx = (struct si_context *)ctx;
1458    struct si_query *squery = (struct si_query *)query;
1459 
1460    return squery->ops->get_result(sctx, squery, wait, result);
1461 }
1462 
1463 static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1464                                          enum pipe_query_flags flags, enum pipe_query_value_type result_type,
1465                                          int index, struct pipe_resource *resource, unsigned offset)
1466 {
1467    struct si_context *sctx = (struct si_context *)ctx;
1468    struct si_query *squery = (struct si_query *)query;
1469 
1470    squery->ops->get_result_resource(sctx, squery, flags, result_type, index, resource, offset);
1471 }
1472 
1473 static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1474 {
1475    util_query_clear_result(result, query->b.type);
1476 }
1477 
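/* CPU readback path: map every buffer in the query's chain, let the
 * type-specific add_result callback accumulate each slot, and finally convert
 * timestamp/time-elapsed results from clock-crystal ticks to nanoseconds
 * (clock_crystal_freq is in kHz, hence the factor of 1000000).
 */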
1478 static bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1479                                    union pipe_query_result *result)
1480 {
1481    struct si_screen *sscreen = sctx->screen;
1482    struct si_query_hw *query = (struct si_query_hw *)squery;
1483    struct si_query_buffer *qbuf;
1484 
1485    query->ops->clear_result(query, result);
1486 
1487    for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1488       unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1489       unsigned results_base = 0;
1490       void *map;
1491 
1492       if (squery->b.flushed)
1493          map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
1494       else
1495          map = si_buffer_map(sctx, qbuf->buf, usage);
1496 
1497       if (!map)
1498          return false;
1499 
1500       while (results_base != qbuf->results_end) {
1501          query->ops->add_result(sscreen, query, map + results_base, result);
1502          results_base += query->result_size;
1503       }
1504    }
1505 
1506    /* Convert the time to expected units. */
1507    if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
1508        squery->type == PIPE_QUERY_TIMESTAMP) {
1509       result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1510    }
1511    return true;
1512 }
1513 
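/* GPU resolve path for get_query_result_resource: instead of mapping the
 * query buffers, a small compute shader (created by si_create_query_result_cs)
 * walks the result slots and writes the final value straight into the
 * destination resource. The scratch allocation below comes from zeroed memory
 * and is used to chain partial results when the query spans several buffers.
 */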
1514 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1515                                             enum pipe_query_flags flags,
1516                                             enum pipe_query_value_type result_type,
1517                                             int index, struct pipe_resource *resource,
1518                                             unsigned offset)
1519 {
1520    struct si_query_hw *query = (struct si_query_hw *)squery;
1521    struct si_query_buffer *qbuf;
1522    struct si_query_buffer *qbuf_prev;
1523    struct pipe_resource *tmp_buffer = NULL;
1524    unsigned tmp_buffer_offset = 0;
1525    struct si_qbo_state saved_state = {};
1526    struct pipe_grid_info grid = {};
1527    struct pipe_constant_buffer constant_buffer = {};
1528    struct pipe_shader_buffer ssbo[3];
1529    struct si_hw_query_params params;
1530    struct {
1531       uint32_t end_offset;
1532       uint32_t result_stride;
1533       uint32_t result_count;
1534       uint32_t config;
1535       uint32_t fence_offset;
1536       uint32_t pair_stride;
1537       uint32_t pair_count;
1538    } consts;
1539 
1540    if (!sctx->query_result_shader) {
1541       sctx->query_result_shader = si_create_query_result_cs(sctx);
1542       if (!sctx->query_result_shader)
1543          return;
1544    }
1545 
1546    if (query->buffer.previous) {
1547       u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1548       if (!tmp_buffer)
1549          return;
1550    }
1551 
1552    si_save_qbo_state(sctx, &saved_state);
1553 
1554    si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
1555    consts.end_offset = params.end_offset - params.start_offset;
1556    consts.fence_offset = params.fence_offset - params.start_offset;
1557    consts.result_stride = query->result_size;
1558    consts.pair_stride = params.pair_stride;
1559    consts.pair_count = params.pair_count;
1560 
1561    constant_buffer.buffer_size = sizeof(consts);
1562    constant_buffer.user_buffer = &consts;
1563 
1564    ssbo[1].buffer = tmp_buffer;
1565    ssbo[1].buffer_offset = tmp_buffer_offset;
1566    ssbo[1].buffer_size = 16;
1567 
1568    ssbo[2] = ssbo[1];
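   /* ssbo[0] = source query buffer (set per iteration below),
    * ssbo[1] = scratch buffer for partial results,
    * ssbo[2] = destination; it points at the scratch buffer until the last
    * buffer of the chain is processed, when it is repointed at the resource.
    */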
1569 
1570    grid.block[0] = 1;
1571    grid.block[1] = 1;
1572    grid.block[2] = 1;
1573    grid.grid[0] = 1;
1574    grid.grid[1] = 1;
1575    grid.grid[2] = 1;
1576 
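   /* consts.config is a bit field consumed by the query-result compute shader
    * (see si_create_query_result_cs for the authoritative definitions). Bits
    * used here: 1 = read previously accumulated values, 2 = write partial
    * results for chaining, 4 = only report result availability (index < 0),
    * 8 = convert to a boolean, 16 = read a single timestamp value, 32 = apply
    * the timestamp frequency conversion, 64 = 64-bit result, 128 = signed
    * 32-bit result, 256 = streamout-overflow comparison mode.
    */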
1577    consts.config = 0;
1578    if (index < 0)
1579       consts.config |= 4;
1580    if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1581        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1582       consts.config |= 8;
1583    else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1584             query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1585       consts.config |= 8 | 256;
1586    else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1587       consts.config |= 32;
1588 
1589    switch (result_type) {
1590    case PIPE_QUERY_TYPE_U64:
1591    case PIPE_QUERY_TYPE_I64:
1592       consts.config |= 64;
1593       break;
1594    case PIPE_QUERY_TYPE_I32:
1595       consts.config |= 128;
1596       break;
1597    case PIPE_QUERY_TYPE_U32:
1598       break;
1599    }
1600 
1601    sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
1602    si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
1603 
1604    for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1605       if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1606          qbuf_prev = qbuf->previous;
1607          consts.result_count = qbuf->results_end / query->result_size;
1608          consts.config &= ~3;
1609          if (qbuf != &query->buffer)
1610             consts.config |= 1;
1611          if (qbuf->previous)
1612             consts.config |= 2;
1613       } else {
1614          /* Only read the last timestamp. */
1615          qbuf_prev = NULL;
1616          consts.result_count = 0;
1617          consts.config |= 16;
1618          params.start_offset += qbuf->results_end - query->result_size;
1619       }
1620 
1621       sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
1622 
1623       ssbo[0].buffer = &qbuf->buf->b.b;
1624       ssbo[0].buffer_offset = params.start_offset;
1625       ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1626 
1627       if (!qbuf->previous) {
1628          ssbo[2].buffer = resource;
1629          ssbo[2].buffer_offset = offset;
1630          ssbo[2].buffer_size = resource->width0 - offset;
1631          /* assert size is correct, based on result_type ? */
1632 
1633          si_resource(resource)->TC_L2_dirty = true;
1634       }
1635 
1636       if ((flags & PIPE_QUERY_WAIT) && qbuf == &query->buffer) {
1637          uint64_t va;
1638 
1639          /* Wait for result availability. Wait only for readiness
1640           * of the last entry, since the fence writes should be
1641           * serialized in the CP.
1642           */
1643          va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1644          va += params.fence_offset;
1645 
1646          si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
1647       }
1648       si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader,
1649                                     SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
1650                                     3, ssbo, 0x4);
1651    }
1652 
1653    si_restore_qbo_state(sctx, &saved_state);
1654    pipe_resource_reference(&tmp_buffer, NULL);
1655 }
1656 
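/* Implementation of pipe_context::render_condition. On GFX8/GFX9 with old PFP
 * firmware, successive SET_PREDICATION packets give the wrong answer for
 * non-inverted streamout-overflow predicates, so the predicate is first
 * resolved into a small 8-byte workaround buffer via the compute path and
 * predication then reads that single value instead.
 */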
1657 static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
1658                                 enum pipe_render_cond_flag mode)
1659 {
1660    struct si_context *sctx = (struct si_context *)ctx;
1661    struct si_query_hw *squery = (struct si_query_hw *)query;
1662    struct si_atom *atom = &sctx->atoms.s.render_cond;
1663 
1664    if (query) {
1665       bool needs_workaround = false;
1666 
1667       /* There was a firmware regression in GFX8 which causes successive
1668        * SET_PREDICATION packets to give the wrong answer for
1669        * non-inverted stream overflow predication.
1670        */
1671       if (((sctx->gfx_level == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
1672            (sctx->gfx_level == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
1673           !condition &&
1674           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1675            (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1676             (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
1677          needs_workaround = true;
1678       }
1679 
1680       if (needs_workaround && !squery->workaround_buf) {
1681          bool old_render_cond_enabled = sctx->render_cond_enabled;
1682          sctx->render_cond_enabled = false;
1683 
1684          u_suballocator_alloc(&sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
1685                               (struct pipe_resource **)&squery->workaround_buf);
1686 
1687          /* Reset to NULL to avoid a redundant SET_PREDICATION
1688           * from launching the compute grid.
1689           */
1690          sctx->render_cond = NULL;
1691 
1692          ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
1693                                         &squery->workaround_buf->b.b, squery->workaround_offset);
1694 
1695          /* Setting this in the render cond atom is too late,
1696           * so set it here. */
1697          sctx->flags |= sctx->screen->barrier_flags.L2_to_cp;
1698          si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
1699 
1700          sctx->render_cond_enabled = old_render_cond_enabled;
1701       }
1702    }
1703 
1704    sctx->render_cond = query;
1705    sctx->render_cond_invert = condition;
1706    sctx->render_cond_mode = mode;
1707    sctx->render_cond_enabled = query;
1708 
1709    si_set_atom_dirty(sctx, atom, query != NULL);
1710 }
1711 
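/* Emit the stop event for every active query. This is paired with
 * si_resume_queries and is used when the gfx command stream has to be flushed
 * while queries are still active.
 */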
1712 void si_suspend_queries(struct si_context *sctx)
1713 {
1714    struct si_query *query;
1715 
1716    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1717       query->ops->suspend(sctx, query);
1718 }
1719 
1720 void si_resume_queries(struct si_context *sctx)
1721 {
1722    struct si_query *query;
1723 
1724    /* Check CS space here. Resuming must not be interrupted by flushes. */
1725    si_need_gfx_cs_space(sctx, 0);
1726 
1727    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1728       query->ops->resume(sctx, query);
1729 }
1730 
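/* Helpers for building the pipe_driver_query_info table below: X() creates an
 * ungrouped query (group id ~0) and XG() places the query in one of the
 * software query groups.
 */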
1731 #define XFULL(name_, query_type_, type_, result_type_, group_id_)                                  \
1732    {                                                                                               \
1733       .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1734       .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_           \
1735    }
1736 
1737 #define X(name_, query_type_, type_, result_type_)                                                 \
1738    XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1739 
1740 #define XG(group_, name_, query_type_, type_, result_type_)                                        \
1741    XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1742 
1743 static struct pipe_driver_query_info si_driver_query_list[] = {
1744    X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1745    X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1746    X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1747    X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1748    X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1749    X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1750    X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1751    X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1752    X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1753    X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1754    X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1755    X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1756    X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1757    X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1758    X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1759    X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1760    X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1761    X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1762    X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1763    X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1764    X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1765    X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1766    X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1767    X("slab-wasted-VRAM", SLAB_WASTED_VRAM, BYTES, AVERAGE),
1768    X("slab-wasted-GTT", SLAB_WASTED_GTT, BYTES, AVERAGE),
1769    X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1770    X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1771    X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1772    X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1773    X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1774    X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1775    X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1776    X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1777    X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1778    X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1779    X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1780    X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1781    X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1782    X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1783    X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1784    X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1785    X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1786    X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1787 
1788    /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1789     * which use them as a fallback path to detect the GPU type.
1790     *
1791     * Note: The names of these queries are significant for GPUPerfStudio
1792     * (and possibly their order as well). */
1793    XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1794    XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1795    XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1796    XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1797    XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1798 
1799    X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1800    X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1801    X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1802 
1803    /* The following queries must be at the end of the list because their
1804     * availability is adjusted dynamically based on the DRM version. */
1805    X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1806    X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1807    X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1808    X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1809    X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1810    X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1811    X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1812    X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1813    X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1814    X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1815    X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1816    X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1817    X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1818    X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1819 
1820    /* SRBM_STATUS2 */
1821    X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1822 
1823    /* CP_STAT */
1824    X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1825    X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1826    X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1827    X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1828    X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1829    X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1830 };
1831 
1832 #undef X
1833 #undef XG
1834 #undef XFULL
1835 
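/* Return how many entries of si_driver_query_list are exposed. The trailing
 * GPU-load and CP_STAT style queries are trimmed from the end of the list
 * when the kernel/GPU combination cannot report them (see the note at the
 * end of the list above).
 */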
1836 static unsigned si_get_num_queries(struct si_screen *sscreen)
1837 {
1838    /* amdgpu */
1839    if (sscreen->info.is_amdgpu) {
1840       if (sscreen->info.gfx_level >= GFX8)
1841          return ARRAY_SIZE(si_driver_query_list);
1842       else
1843          return ARRAY_SIZE(si_driver_query_list) - 7;
1844    }
1845 
1846    /* radeon */
1847    if (sscreen->info.gfx_level == GFX7)
1848       return ARRAY_SIZE(si_driver_query_list) - 6;
1849    else
1850       return ARRAY_SIZE(si_driver_query_list) - 7;
1851 
1852    return ARRAY_SIZE(si_driver_query_list) - 21;
1853 }
1854 
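/* pipe_screen::get_driver_query_info. With info == NULL it only returns the
 * total number of queries (driver queries plus performance counters); indices
 * past the driver query list are forwarded to the perfcounter code, and the
 * max_value fields provide upper bounds (VRAM/GTT sizes, temperature limit)
 * for consumers that plot these queries.
 */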
1855 static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1856                                     struct pipe_driver_query_info *info)
1857 {
1858    struct si_screen *sscreen = (struct si_screen *)screen;
1859    unsigned num_queries = si_get_num_queries(sscreen);
1860 
1861    if (!info) {
1862       unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1863 
1864       return num_queries + num_perfcounters;
1865    }
1866 
1867    if (index >= num_queries)
1868       return si_get_perfcounter_info(sscreen, index - num_queries, info);
1869 
1870    *info = si_driver_query_list[index];
1871 
1872    switch (info->query_type) {
1873    case SI_QUERY_REQUESTED_VRAM:
1874    case SI_QUERY_VRAM_USAGE:
1875    case SI_QUERY_MAPPED_VRAM:
1876    case SI_QUERY_SLAB_WASTED_VRAM:
1877       info->max_value.u64 = (uint64_t)sscreen->info.vram_size_kb * 1024;
1878       break;
1879    case SI_QUERY_REQUESTED_GTT:
1880    case SI_QUERY_GTT_USAGE:
1881    case SI_QUERY_MAPPED_GTT:
1882    case SI_QUERY_SLAB_WASTED_GTT:
1883       info->max_value.u64 = (uint64_t)sscreen->info.gart_size_kb * 1024;
1884       break;
1885    case SI_QUERY_GPU_TEMPERATURE:
1886       info->max_value.u64 = 125;
1887       break;
1888    case SI_QUERY_VRAM_VIS_USAGE:
1889       info->max_value.u64 = (uint64_t)sscreen->info.vram_vis_size_kb * 1024;
1890       break;
1891    }
1892 
1893    if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1894       info->group_id += sscreen->perfcounters->base.num_groups;
1895 
1896    return 1;
1897 }
1898 
1899 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1900  * performance counter groups, so be careful when changing this and related
1901  * functions.
1902  */
1903 static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1904                                           struct pipe_driver_query_group_info *info)
1905 {
1906    struct si_screen *sscreen = (struct si_screen *)screen;
1907    unsigned num_pc_groups = 0;
1908 
1909    if (sscreen->perfcounters)
1910       num_pc_groups = sscreen->perfcounters->base.num_groups;
1911 
1912    if (!info)
1913       return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1914 
1915    if (index < num_pc_groups)
1916       return si_get_perfcounter_group_info(sscreen, index, info);
1917 
1918    index -= num_pc_groups;
1919    if (index >= SI_NUM_SW_QUERY_GROUPS)
1920       return 0;
1921 
1922    info->name = "GPIN";
1923    info->max_active_queries = 5;
1924    info->num_queries = 5;
1925    return 1;
1926 }
1927 
1928 static const struct si_query_ops query_hw_ops = {
1929    .destroy = si_query_hw_destroy,
1930    .begin = si_query_hw_begin,
1931    .end = si_query_hw_end,
1932    .get_result = si_query_hw_get_result,
1933    .get_result_resource = si_query_hw_get_result_resource,
1934 
1935    .suspend = si_query_hw_suspend,
1936    .resume = si_query_hw_resume,
1937 };
1938 
1939 static const struct si_query_ops sw_query_ops = {
1940    .destroy = si_query_sw_destroy,
1941    .begin = si_query_sw_begin,
1942    .end = si_query_sw_end,
1943    .get_result = si_query_sw_get_result,
1944    .get_result_resource = NULL
1945 };
1946 
1947 static const struct si_query_hw_ops query_hw_default_hw_ops = {
1948    .prepare_buffer = si_query_hw_prepare_buffer,
1949    .emit_start = si_query_hw_do_emit_start,
1950    .emit_stop = si_query_hw_do_emit_stop,
1951    .clear_result = si_query_hw_clear_result,
1952    .add_result = si_query_hw_add_result,
1953 };
1954 
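/* Hook the query entry points into the context. render_condition is only
 * exposed on contexts that have graphics support.
 */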
1955 void si_init_query_functions(struct si_context *sctx)
1956 {
1957    sctx->b.create_query = si_create_query;
1958    sctx->b.create_batch_query = si_create_batch_query;
1959    sctx->b.destroy_query = si_destroy_query;
1960    sctx->b.begin_query = si_begin_query;
1961    sctx->b.end_query = si_end_query;
1962    sctx->b.get_query_result = si_get_query_result;
1963    sctx->b.get_query_result_resource = si_get_query_result_resource;
1964 
1965    if (sctx->has_graphics) {
1966       sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1967       sctx->b.render_condition = si_render_condition;
1968    }
1969 
1970    list_inithead(&sctx->active_queries);
1971 }
1972 
1973 void si_init_screen_query_functions(struct si_screen *sscreen)
1974 {
1975    sscreen->b.get_driver_query_info = si_get_driver_query_info;
1976    sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1977 }
1978