1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4  * Copyright 2018 Advanced Micro Devices, Inc.
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include "si_query.h"
10 #include "si_build_pm4.h"
11 
12 #include "amd/common/sid.h"
13 #include "si_pipe.h"
14 #include "util/os_time.h"
15 #include "util/u_memory.h"
16 #include "util/u_suballoc.h"
17 #include "util/u_upload_mgr.h"
18 
19 static const struct si_query_ops hw_query_ops;
20 static const struct si_query_ops sw_query_ops;
21 
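/* Byte offsets describing where one query's begin/end snapshots and its readiness fence live
 * in the result buffer, plus how many (begin, end) pairs there are and how far apart they are
 * (e.g. one pair per render backend for occlusion queries). Filled in by
 * si_get_hw_query_result_shader_params() below. */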
22 struct si_hw_query_params {
23    unsigned start_offset;
24    unsigned end_offset;
25    unsigned fence_offset;
26    unsigned pair_stride;
27    unsigned pair_count;
28 };
29 
30 /* Queries without buffer handling or suspend/resume. */
31 struct si_query_sw {
32    struct si_query b;
33 
34    uint64_t begin_result;
35    uint64_t end_result;
36 
37    uint64_t begin_time;
38    uint64_t end_time;
39 
40    /* Fence for GPU_FINISHED. */
41    struct pipe_fence_handle *fence;
42 };
43 
44 static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
45 {
46    struct si_query_sw *query = (struct si_query_sw *)squery;
47 
48    sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
49    FREE(query);
50 }
51 
52 static enum radeon_value_id winsys_id_from_type(unsigned type)
53 {
54    switch (type) {
55    case SI_QUERY_REQUESTED_VRAM:
56       return RADEON_REQUESTED_VRAM_MEMORY;
57    case SI_QUERY_REQUESTED_GTT:
58       return RADEON_REQUESTED_GTT_MEMORY;
59    case SI_QUERY_MAPPED_VRAM:
60       return RADEON_MAPPED_VRAM;
61    case SI_QUERY_MAPPED_GTT:
62       return RADEON_MAPPED_GTT;
63    case SI_QUERY_SLAB_WASTED_VRAM:
64       return RADEON_SLAB_WASTED_VRAM;
65    case SI_QUERY_SLAB_WASTED_GTT:
66       return RADEON_SLAB_WASTED_GTT;
67    case SI_QUERY_BUFFER_WAIT_TIME:
68       return RADEON_BUFFER_WAIT_TIME_NS;
69    case SI_QUERY_NUM_MAPPED_BUFFERS:
70       return RADEON_NUM_MAPPED_BUFFERS;
71    case SI_QUERY_NUM_GFX_IBS:
72       return RADEON_NUM_GFX_IBS;
73    case SI_QUERY_GFX_BO_LIST_SIZE:
74       return RADEON_GFX_BO_LIST_COUNTER;
75    case SI_QUERY_GFX_IB_SIZE:
76       return RADEON_GFX_IB_SIZE_COUNTER;
77    case SI_QUERY_NUM_BYTES_MOVED:
78       return RADEON_NUM_BYTES_MOVED;
79    case SI_QUERY_NUM_EVICTIONS:
80       return RADEON_NUM_EVICTIONS;
81    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
82       return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
83    case SI_QUERY_VRAM_USAGE:
84       return RADEON_VRAM_USAGE;
85    case SI_QUERY_VRAM_VIS_USAGE:
86       return RADEON_VRAM_VIS_USAGE;
87    case SI_QUERY_GTT_USAGE:
88       return RADEON_GTT_USAGE;
89    case SI_QUERY_GPU_TEMPERATURE:
90       return RADEON_GPU_TEMPERATURE;
91    case SI_QUERY_CURRENT_GPU_SCLK:
92       return RADEON_CURRENT_SCLK;
93    case SI_QUERY_CURRENT_GPU_MCLK:
94       return RADEON_CURRENT_MCLK;
95    case SI_QUERY_CS_THREAD_BUSY:
96       return RADEON_CS_THREAD_TIME;
97    default:
98       unreachable("query type does not correspond to winsys id");
99    }
100 }
101 
102 static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
103 {
104    struct si_query_sw *query = (struct si_query_sw *)squery;
105    enum radeon_value_id ws_id;
106 
107    switch (query->b.type) {
108    case PIPE_QUERY_TIMESTAMP_DISJOINT:
109    case PIPE_QUERY_GPU_FINISHED:
110       break;
111    case SI_QUERY_DRAW_CALLS:
112       query->begin_result = sctx->num_draw_calls;
113       break;
114    case SI_QUERY_DECOMPRESS_CALLS:
115       query->begin_result = sctx->num_decompress_calls;
116       break;
117    case SI_QUERY_COMPUTE_CALLS:
118       query->begin_result = sctx->num_compute_calls;
119       break;
120    case SI_QUERY_CP_DMA_CALLS:
121       query->begin_result = sctx->num_cp_dma_calls;
122       break;
123    case SI_QUERY_NUM_VS_FLUSHES:
124       query->begin_result = sctx->num_vs_flushes;
125       break;
126    case SI_QUERY_NUM_PS_FLUSHES:
127       query->begin_result = sctx->num_ps_flushes;
128       break;
129    case SI_QUERY_NUM_CS_FLUSHES:
130       query->begin_result = sctx->num_cs_flushes;
131       break;
132    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
133       query->begin_result = sctx->num_cb_cache_flushes;
134       break;
135    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
136       query->begin_result = sctx->num_db_cache_flushes;
137       break;
138    case SI_QUERY_NUM_L2_INVALIDATES:
139       query->begin_result = sctx->num_L2_invalidates;
140       break;
141    case SI_QUERY_NUM_L2_WRITEBACKS:
142       query->begin_result = sctx->num_L2_writebacks;
143       break;
144    case SI_QUERY_NUM_RESIDENT_HANDLES:
145       query->begin_result = sctx->num_resident_handles;
146       break;
147    case SI_QUERY_TC_OFFLOADED_SLOTS:
148       query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
149       break;
150    case SI_QUERY_TC_DIRECT_SLOTS:
151       query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
152       break;
153    case SI_QUERY_TC_NUM_SYNCS:
154       query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
155       break;
156    case SI_QUERY_REQUESTED_VRAM:
157    case SI_QUERY_REQUESTED_GTT:
158    case SI_QUERY_MAPPED_VRAM:
159    case SI_QUERY_MAPPED_GTT:
160    case SI_QUERY_SLAB_WASTED_VRAM:
161    case SI_QUERY_SLAB_WASTED_GTT:
162    case SI_QUERY_VRAM_USAGE:
163    case SI_QUERY_VRAM_VIS_USAGE:
164    case SI_QUERY_GTT_USAGE:
165    case SI_QUERY_GPU_TEMPERATURE:
166    case SI_QUERY_CURRENT_GPU_SCLK:
167    case SI_QUERY_CURRENT_GPU_MCLK:
168    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
169    case SI_QUERY_NUM_MAPPED_BUFFERS:
170       query->begin_result = 0;
171       break;
172    case SI_QUERY_BUFFER_WAIT_TIME:
173    case SI_QUERY_GFX_IB_SIZE:
174    case SI_QUERY_NUM_GFX_IBS:
175    case SI_QUERY_NUM_BYTES_MOVED:
176    case SI_QUERY_NUM_EVICTIONS:
177    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
178       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
179       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
180       break;
181    }
182    case SI_QUERY_GFX_BO_LIST_SIZE:
183       ws_id = winsys_id_from_type(query->b.type);
184       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
185       query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
186       break;
187    case SI_QUERY_CS_THREAD_BUSY:
188       ws_id = winsys_id_from_type(query->b.type);
189       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
190       query->begin_time = os_time_get_nano();
191       break;
192    case SI_QUERY_GALLIUM_THREAD_BUSY:
193       query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
194       query->begin_time = os_time_get_nano();
195       break;
196    case SI_QUERY_GPU_LOAD:
197    case SI_QUERY_GPU_SHADERS_BUSY:
198    case SI_QUERY_GPU_TA_BUSY:
199    case SI_QUERY_GPU_GDS_BUSY:
200    case SI_QUERY_GPU_VGT_BUSY:
201    case SI_QUERY_GPU_IA_BUSY:
202    case SI_QUERY_GPU_SX_BUSY:
203    case SI_QUERY_GPU_WD_BUSY:
204    case SI_QUERY_GPU_BCI_BUSY:
205    case SI_QUERY_GPU_SC_BUSY:
206    case SI_QUERY_GPU_PA_BUSY:
207    case SI_QUERY_GPU_DB_BUSY:
208    case SI_QUERY_GPU_CP_BUSY:
209    case SI_QUERY_GPU_CB_BUSY:
210    case SI_QUERY_GPU_SDMA_BUSY:
211    case SI_QUERY_GPU_PFP_BUSY:
212    case SI_QUERY_GPU_MEQ_BUSY:
213    case SI_QUERY_GPU_ME_BUSY:
214    case SI_QUERY_GPU_SURF_SYNC_BUSY:
215    case SI_QUERY_GPU_CP_DMA_BUSY:
216    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
217       query->begin_result = si_begin_counter(sctx->screen, query->b.type);
218       break;
219    case SI_QUERY_NUM_COMPILATIONS:
220       query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
221       break;
222    case SI_QUERY_NUM_SHADERS_CREATED:
223       query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
224       break;
225    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
226       query->begin_result = sctx->screen->live_shader_cache.hits;
227       break;
228    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
229       query->begin_result = sctx->screen->live_shader_cache.misses;
230       break;
231    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
232       query->begin_result = sctx->screen->num_memory_shader_cache_hits;
233       break;
234    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
235       query->begin_result = sctx->screen->num_memory_shader_cache_misses;
236       break;
237    case SI_QUERY_DISK_SHADER_CACHE_HITS:
238       query->begin_result = sctx->screen->num_disk_shader_cache_hits;
239       break;
240    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
241       query->begin_result = sctx->screen->num_disk_shader_cache_misses;
242       break;
243    case SI_QUERY_GPIN_ASIC_ID:
244    case SI_QUERY_GPIN_NUM_SIMD:
245    case SI_QUERY_GPIN_NUM_RB:
246    case SI_QUERY_GPIN_NUM_SPI:
247    case SI_QUERY_GPIN_NUM_SE:
248       break;
249    default:
250       unreachable("si_query_sw_begin: bad query type");
251    }
252 
253    return true;
254 }
255 
256 static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
257 {
258    struct si_query_sw *query = (struct si_query_sw *)squery;
259    enum radeon_value_id ws_id;
260 
261    switch (query->b.type) {
262    case PIPE_QUERY_TIMESTAMP_DISJOINT:
263       break;
264    case PIPE_QUERY_GPU_FINISHED:
265       sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
266       break;
267    case SI_QUERY_DRAW_CALLS:
268       query->end_result = sctx->num_draw_calls;
269       break;
270    case SI_QUERY_DECOMPRESS_CALLS:
271       query->end_result = sctx->num_decompress_calls;
272       break;
273    case SI_QUERY_COMPUTE_CALLS:
274       query->end_result = sctx->num_compute_calls;
275       break;
276    case SI_QUERY_CP_DMA_CALLS:
277       query->end_result = sctx->num_cp_dma_calls;
278       break;
279    case SI_QUERY_NUM_VS_FLUSHES:
280       query->end_result = sctx->num_vs_flushes;
281       break;
282    case SI_QUERY_NUM_PS_FLUSHES:
283       query->end_result = sctx->num_ps_flushes;
284       break;
285    case SI_QUERY_NUM_CS_FLUSHES:
286       query->end_result = sctx->num_cs_flushes;
287       break;
288    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
289       query->end_result = sctx->num_cb_cache_flushes;
290       break;
291    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
292       query->end_result = sctx->num_db_cache_flushes;
293       break;
294    case SI_QUERY_NUM_L2_INVALIDATES:
295       query->end_result = sctx->num_L2_invalidates;
296       break;
297    case SI_QUERY_NUM_L2_WRITEBACKS:
298       query->end_result = sctx->num_L2_writebacks;
299       break;
300    case SI_QUERY_NUM_RESIDENT_HANDLES:
301       query->end_result = sctx->num_resident_handles;
302       break;
303    case SI_QUERY_TC_OFFLOADED_SLOTS:
304       query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
305       break;
306    case SI_QUERY_TC_DIRECT_SLOTS:
307       query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
308       break;
309    case SI_QUERY_TC_NUM_SYNCS:
310       query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
311       break;
312    case SI_QUERY_REQUESTED_VRAM:
313    case SI_QUERY_REQUESTED_GTT:
314    case SI_QUERY_MAPPED_VRAM:
315    case SI_QUERY_MAPPED_GTT:
316    case SI_QUERY_SLAB_WASTED_VRAM:
317    case SI_QUERY_SLAB_WASTED_GTT:
318    case SI_QUERY_VRAM_USAGE:
319    case SI_QUERY_VRAM_VIS_USAGE:
320    case SI_QUERY_GTT_USAGE:
321    case SI_QUERY_GPU_TEMPERATURE:
322    case SI_QUERY_CURRENT_GPU_SCLK:
323    case SI_QUERY_CURRENT_GPU_MCLK:
324    case SI_QUERY_BUFFER_WAIT_TIME:
325    case SI_QUERY_GFX_IB_SIZE:
326    case SI_QUERY_NUM_MAPPED_BUFFERS:
327    case SI_QUERY_NUM_GFX_IBS:
328    case SI_QUERY_NUM_BYTES_MOVED:
329    case SI_QUERY_NUM_EVICTIONS:
330    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
331       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
332       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
333       break;
334    }
335    case SI_QUERY_GFX_BO_LIST_SIZE:
336       ws_id = winsys_id_from_type(query->b.type);
337       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
338       query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
339       break;
340    case SI_QUERY_CS_THREAD_BUSY:
341       ws_id = winsys_id_from_type(query->b.type);
342       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
343       query->end_time = os_time_get_nano();
344       break;
345    case SI_QUERY_GALLIUM_THREAD_BUSY:
346       query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
347       query->end_time = os_time_get_nano();
348       break;
349    case SI_QUERY_GPU_LOAD:
350    case SI_QUERY_GPU_SHADERS_BUSY:
351    case SI_QUERY_GPU_TA_BUSY:
352    case SI_QUERY_GPU_GDS_BUSY:
353    case SI_QUERY_GPU_VGT_BUSY:
354    case SI_QUERY_GPU_IA_BUSY:
355    case SI_QUERY_GPU_SX_BUSY:
356    case SI_QUERY_GPU_WD_BUSY:
357    case SI_QUERY_GPU_BCI_BUSY:
358    case SI_QUERY_GPU_SC_BUSY:
359    case SI_QUERY_GPU_PA_BUSY:
360    case SI_QUERY_GPU_DB_BUSY:
361    case SI_QUERY_GPU_CP_BUSY:
362    case SI_QUERY_GPU_CB_BUSY:
363    case SI_QUERY_GPU_SDMA_BUSY:
364    case SI_QUERY_GPU_PFP_BUSY:
365    case SI_QUERY_GPU_MEQ_BUSY:
366    case SI_QUERY_GPU_ME_BUSY:
367    case SI_QUERY_GPU_SURF_SYNC_BUSY:
368    case SI_QUERY_GPU_CP_DMA_BUSY:
369    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
370       query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
371       query->begin_result = 0;
372       break;
373    case SI_QUERY_NUM_COMPILATIONS:
374       query->end_result = p_atomic_read(&sctx->screen->num_compilations);
375       break;
376    case SI_QUERY_NUM_SHADERS_CREATED:
377       query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
378       break;
379    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
380       query->end_result = sctx->last_tex_ps_draw_ratio;
381       break;
382    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
383       query->end_result = sctx->screen->live_shader_cache.hits;
384       break;
385    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
386       query->end_result = sctx->screen->live_shader_cache.misses;
387       break;
388    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
389       query->end_result = sctx->screen->num_memory_shader_cache_hits;
390       break;
391    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
392       query->end_result = sctx->screen->num_memory_shader_cache_misses;
393       break;
394    case SI_QUERY_DISK_SHADER_CACHE_HITS:
395       query->end_result = sctx->screen->num_disk_shader_cache_hits;
396       break;
397    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
398       query->end_result = sctx->screen->num_disk_shader_cache_misses;
399       break;
400    case SI_QUERY_GPIN_ASIC_ID:
401    case SI_QUERY_GPIN_NUM_SIMD:
402    case SI_QUERY_GPIN_NUM_RB:
403    case SI_QUERY_GPIN_NUM_SPI:
404    case SI_QUERY_GPIN_NUM_SE:
405       break;
406    default:
407       unreachable("si_query_sw_end: bad query type");
408    }
409 
410    return true;
411 }
412 
413 static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
414                                    union pipe_query_result *result)
415 {
416    struct si_query_sw *query = (struct si_query_sw *)squery;
417 
418    switch (query->b.type) {
419    case PIPE_QUERY_TIMESTAMP_DISJOINT:
420       /* Convert from cycles per millisecond to cycles per second (Hz). */
421       result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
422       result->timestamp_disjoint.disjoint = false;
423       return true;
424    case PIPE_QUERY_GPU_FINISHED: {
425       struct pipe_screen *screen = sctx->b.screen;
426       struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
427 
428       result->b = screen->fence_finish(screen, ctx, query->fence, wait ? OS_TIMEOUT_INFINITE : 0);
429       return result->b;
430    }
431 
432    case SI_QUERY_GFX_BO_LIST_SIZE:
433       result->u64 =
434          (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
435       return true;
436    case SI_QUERY_CS_THREAD_BUSY:
437    case SI_QUERY_GALLIUM_THREAD_BUSY:
438       result->u64 =
439          (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
440       return true;
441    case SI_QUERY_GPIN_ASIC_ID:
442       result->u32 = 0;
443       return true;
444    case SI_QUERY_GPIN_NUM_SIMD:
445       result->u32 = sctx->screen->info.num_cu;
446       return true;
447    case SI_QUERY_GPIN_NUM_RB:
448       result->u32 = sctx->screen->info.max_render_backends;
449       return true;
450    case SI_QUERY_GPIN_NUM_SPI:
451       result->u32 = 1; /* all supported chips have one SPI per SE */
452       return true;
453    case SI_QUERY_GPIN_NUM_SE:
454       result->u32 = sctx->screen->info.max_se;
455       return true;
456    }
457 
458    result->u64 = query->end_result - query->begin_result;
459 
460    switch (query->b.type) {
461    case SI_QUERY_BUFFER_WAIT_TIME:
462    case SI_QUERY_GPU_TEMPERATURE:
463       result->u64 /= 1000;
464       break;
465    case SI_QUERY_CURRENT_GPU_SCLK:
466    case SI_QUERY_CURRENT_GPU_MCLK:
467       result->u64 *= 1000000;
468       break;
469    }
470 
471    return true;
472 }
473 
474 static struct pipe_query *si_query_sw_create(unsigned query_type)
475 {
476    struct si_query_sw *query;
477 
478    query = CALLOC_STRUCT(si_query_sw);
479    if (!query)
480       return NULL;
481 
482    query->b.type = query_type;
483    query->b.ops = &sw_query_ops;
484 
485    return (struct pipe_query *)query;
486 }
487 
488 void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
489 {
490    struct si_query_buffer *prev = buffer->previous;
491 
492    /* Release all query buffers. */
493    while (prev) {
494       struct si_query_buffer *qbuf = prev;
495       prev = prev->previous;
496       si_resource_reference(&qbuf->buf, NULL);
497       FREE(qbuf);
498    }
499 
500    si_resource_reference(&buffer->buf, NULL);
501 }
502 
503 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
504 {
505    /* Discard all query buffers except for the oldest. */
506    while (buffer->previous) {
507       struct si_query_buffer *qbuf = buffer->previous;
508       buffer->previous = qbuf->previous;
509 
510       si_resource_reference(&buffer->buf, NULL);
511       buffer->buf = qbuf->buf; /* move ownership */
512       FREE(qbuf);
513    }
514    buffer->results_end = 0;
515 
516    if (!buffer->buf)
517       return;
518 
519    /* Discard even the oldest buffer if it can't be mapped without a stall. */
520    if (si_cs_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
521        !sctx->ws->buffer_wait(sctx->ws, buffer->buf->buf, 0,
522                               RADEON_USAGE_READWRITE | RADEON_USAGE_DISALLOW_SLOW_REPLY)) {
523       si_resource_reference(&buffer->buf, NULL);
524    }
525 }
526 
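/* Make sure the current query buffer has room for one more result of the given size. When it
 * is full, the buffer is pushed onto the ->previous list and a fresh one is allocated, so a
 * long-running query accumulates results across several buffers that are walked again when
 * results are gathered. */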
527 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
528                            bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
529                            unsigned size)
530 {
531    if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
532       if (buffer->buf) {
533          struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
534          memcpy(qbuf, buffer, sizeof(*qbuf));
535          buffer->previous = qbuf;
536       }
537       buffer->results_end = 0;
538 
539       /* Queries are normally read by the CPU after
540        * being written by the GPU, hence staging is probably a good
541        * usage pattern.
542        */
543       struct si_screen *screen = sctx->screen;
544       unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
545 
546       /* We need to bypass GL2 for queries if SET_PREDICATION accesses it uncached
547        * in a spinloop.
548        */
549       buffer->buf = si_aligned_buffer_create(&screen->b,
550                                               screen->info.cp_sdma_ge_use_system_memory_scope ?
551                                                  SI_RESOURCE_FLAG_GL2_BYPASS : 0,
552                                               PIPE_USAGE_STAGING, buf_size, 256);
553       if (unlikely(!buffer->buf))
554          return false;
555    }
556 
557    if (!buffer->results_end && prepare_buffer) {
558       if (unlikely(!prepare_buffer(sctx, buffer))) {
559          si_resource_reference(&buffer->buf, NULL);
560          return false;
561       }
562    }
563 
564    return true;
565 }
566 
567 static void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
568 {
569    struct si_query_hw *query = (struct si_query_hw *)squery;
570 
571    si_query_buffer_destroy(sctx->screen, &query->buffer);
572    si_resource_reference(&query->workaround_buf, NULL);
573    FREE(squery);
574 }
575 
576 static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
577 {
578    struct si_query_hw *query = container_of(qbuf, struct si_query_hw, buffer);
579    struct si_screen *screen = sctx->screen;
580 
581    /* The caller ensures that the buffer is currently unused by the GPU. */
582    uint32_t *results = screen->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
583                                               PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
584    if (!results)
585       return false;
586 
587    memset(results, 0, qbuf->buf->b.b.width0);
588 
589    if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
590        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
591        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
592       unsigned max_rbs = screen->info.max_render_backends;
593       uint64_t enabled_rb_mask = screen->info.enabled_rb_mask;
594       unsigned num_results;
595       unsigned i, j;
596 
597       /* Set top bits for unused backends. */
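      /* Bit 31 of each high dword doubles as a "result written" flag: si_query_read_result()
       * only trusts a begin/end pair once both flags are set, so pre-setting them for
       * disabled RBs keeps result gathering from waiting on values that will never be
       * written. */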
598       num_results = qbuf->buf->b.b.width0 / query->result_size;
599       for (j = 0; j < num_results; j++) {
600          for (i = 0; i < max_rbs; i++) {
601             if (!(enabled_rb_mask & (1ull << i))) {
602                results[(i * 4) + 1] = 0x80000000;
603                results[(i * 4) + 3] = 0x80000000;
604             }
605          }
606          results += 4 * max_rbs;
607       }
608    }
609 
610    return true;
611 }
612 
613 static unsigned si_query_pipestats_num_results(struct si_screen *sscreen)
614 {
615    return sscreen->info.gfx_level >= GFX11 ? 14 : 11;
616 }
617 
618 static unsigned si_query_pipestat_dw_offset(enum pipe_statistics_query_index index)
619 {
620    switch (index) {
621    case PIPE_STAT_QUERY_PS_INVOCATIONS: return 0;
622    case PIPE_STAT_QUERY_C_PRIMITIVES: return 2;
623    case PIPE_STAT_QUERY_C_INVOCATIONS: return 4;
624    case PIPE_STAT_QUERY_VS_INVOCATIONS: return 6;
625    case PIPE_STAT_QUERY_GS_INVOCATIONS: return 8;
626    case PIPE_STAT_QUERY_GS_PRIMITIVES: return 10;
627    case PIPE_STAT_QUERY_IA_PRIMITIVES: return 12;
628    case PIPE_STAT_QUERY_IA_VERTICES: return 14;
629    case PIPE_STAT_QUERY_HS_INVOCATIONS: return 16;
630    case PIPE_STAT_QUERY_DS_INVOCATIONS: return 18;
631    case PIPE_STAT_QUERY_CS_INVOCATIONS: return 20;
632    /* gfx11: MS_INVOCATIONS */
633    /* gfx11: MS_PRIMITIVES */
634    /* gfx11: TS_INVOCATIONS */
635    default:
636       assert(false);
637    }
638    return ~0;
639 }
640 
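/* Pipeline statistics are written as two snapshots of num_results 64-bit counters, with the
 * end snapshot immediately following the begin snapshot, so the end offset of a counter is
 * its begin offset shifted by num_results * 2 dwords. */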
641 unsigned si_query_pipestat_end_dw_offset(struct si_screen *sscreen,
642                                          enum pipe_statistics_query_index index)
643 {
644    return si_query_pipestats_num_results(sscreen) * 2 + si_query_pipestat_dw_offset(index);
645 }
646 
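/* result_size is the number of bytes one begin/end snapshot pair (including the fence) takes
 * up in the query buffer; num_cs_dw_suspend is an upper bound on the command-stream dwords
 * needed to stop the query, accumulated in sctx->num_cs_dw_queries_suspend while the query
 * is active. */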
647 static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
648                                              unsigned index)
649 {
650    struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
651    if (!query)
652       return NULL;
653 
654    query->b.type = query_type;
655    query->b.ops = &hw_query_ops;
656 
657    switch (query_type) {
658    case PIPE_QUERY_OCCLUSION_COUNTER:
659    case PIPE_QUERY_OCCLUSION_PREDICATE:
660    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
661       query->result_size = 16 * sscreen->info.max_render_backends;
662       query->result_size += 16; /* for the fence + alignment */
663       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
664       break;
665    case PIPE_QUERY_TIME_ELAPSED:
666       query->result_size = 16;
667       query->result_size += 8; /* for fence */
668       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
669       break;
670    case PIPE_QUERY_TIMESTAMP:
671       query->result_size = 8;
672       query->result_size += 8; /* for fence */
673       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
674       query->flags = SI_QUERY_HW_FLAG_NO_START;
675       break;
676    case PIPE_QUERY_PRIMITIVES_EMITTED:
677    case PIPE_QUERY_PRIMITIVES_GENERATED:
678    case PIPE_QUERY_SO_STATISTICS:
679    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
680       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
681       /* The 64th bit of each qword is used as a fence; the hardware sets it when it writes the streamout stats event. */
682       query->result_size = 32;
683       query->b.num_cs_dw_suspend = 6;
684       query->stream = index;
685       break;
686    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
687       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
688       /* The 64th bit of each qword is used as a fence; the hardware sets it when it writes the streamout stats event. */
689       query->result_size = 32 * SI_MAX_STREAMS;
690       query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
691       break;
692    case PIPE_QUERY_PIPELINE_STATISTICS:
693       query->result_size = si_query_pipestats_num_results(sscreen) * 16;
694       query->result_size += 8; /* for the fence + alignment */
695       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
696       query->index = index;
697       if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) &&
698           sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3))
699          query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
700 
701       /* GFX11 only emulates PIPE_STAT_QUERY_GS_PRIMITIVES because the shader culls,
702        * which makes the statistic incorrect.
703        */
704       if (sscreen->info.gfx_level >= GFX11 && index == PIPE_STAT_QUERY_GS_PRIMITIVES)
705          query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
706       break;
707    default:
708       assert(0);
709       FREE(query);
710       return NULL;
711    }
712 
713    return (struct pipe_query *)query;
714 }
715 
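/* Track how many occlusion queries of each type are active and derive the strongest DB mode
 * still needed (precise integer > precise boolean > conservative > disabled). Any mode change
 * re-emits db_render_state, and toggling the precise-integer mode also re-validates
 * out-of-order rasterization. */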
716 static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
717 {
718    if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
719        type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
720       switch (type) {
721       case PIPE_QUERY_OCCLUSION_COUNTER:
722          sctx->num_integer_occlusion_queries += diff;
723          break;
724       case PIPE_QUERY_OCCLUSION_PREDICATE:
725          sctx->num_boolean_occlusion_queries += diff;
726          break;
727       case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
728          sctx->num_conservative_occlusion_queries += diff;
729          break;
730       }
731 
732       assert(sctx->num_integer_occlusion_queries >= 0);
733       assert(sctx->num_boolean_occlusion_queries >= 0);
734       assert(sctx->num_conservative_occlusion_queries >= 0);
735 
736       enum si_occlusion_query_mode new_mode =
737          sctx->num_integer_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER :
738          sctx->num_boolean_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN :
739          sctx->num_conservative_occlusion_queries ? SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN :
740          SI_OCCLUSION_QUERY_MODE_DISABLE;
741 
742       /* Conservative queries are only available on gfx10+. On gfx11+, they perform worse
743        * with late Z, but not early Z. Instead of trying to detect late Z, never enable
744        * conservative queries to keep it simple. This is the recommended programming.
745        */
746       if (new_mode == SI_OCCLUSION_QUERY_MODE_CONSERVATIVE_BOOLEAN &&
747           (sctx->gfx_level < GFX10 || sctx->gfx_level >= GFX11))
748          new_mode = SI_OCCLUSION_QUERY_MODE_PRECISE_BOOLEAN;
749 
750       if (sctx->occlusion_query_mode != new_mode) {
751          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
752 
753          if (sctx->screen->info.has_out_of_order_rast &&
754              (sctx->occlusion_query_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER) !=
755              (new_mode == SI_OCCLUSION_QUERY_MODE_PRECISE_INTEGER))
756             si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
757 
758          sctx->occlusion_query_mode = new_mode;
759       }
760    }
761 }
762 
763 static unsigned event_type_for_stream(unsigned stream)
764 {
765    switch (stream) {
766    default:
767    case 0:
768       return V_028A90_SAMPLE_STREAMOUTSTATS;
769    case 1:
770       return V_028A90_SAMPLE_STREAMOUTSTATS1;
771    case 2:
772       return V_028A90_SAMPLE_STREAMOUTSTATS2;
773    case 3:
774       return V_028A90_SAMPLE_STREAMOUTSTATS3;
775    }
776 }
777 
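/* Sample the streamout statistics (NumPrimitivesWritten, PrimitiveStorageNeeded) of one
 * vertex stream into the query buffer at the given GPU address. */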
778 static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
779 {
780    radeon_begin(cs);
781    radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
782    radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
783    radeon_emit(va);
784    radeon_emit(va >> 32);
785    radeon_end();
786 }
787 
788 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
789                                       struct si_resource *buffer, uint64_t va)
790 {
791    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
792 
793    switch (query->b.type) {
794    case PIPE_QUERY_OCCLUSION_COUNTER:
795    case PIPE_QUERY_OCCLUSION_PREDICATE:
796    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
797       radeon_begin(cs);
798       if (sctx->gfx_level >= GFX11 &&
799           sctx->screen->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
800          radeon_emit(PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
801       } else {
802          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
803          if (sctx->gfx_level >= GFX11)
804             radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
805          else
806             radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
807       }
808       radeon_emit(va);
809       radeon_emit(va >> 32);
810       radeon_end();
811       break;
812    }
813    case PIPE_QUERY_PRIMITIVES_EMITTED:
814    case PIPE_QUERY_PRIMITIVES_GENERATED:
815    case PIPE_QUERY_SO_STATISTICS:
816    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
817       emit_sample_streamout(cs, va, query->stream);
818       break;
819    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
820       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
821          emit_sample_streamout(cs, va + 32 * stream, stream);
822       break;
823    case PIPE_QUERY_TIME_ELAPSED:
824       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
825                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
826       break;
827    case PIPE_QUERY_PIPELINE_STATISTICS: {
828       if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
829          /* The hw GS primitive counter doesn't work when ngg is active.
830           * So if use_ngg is true, we don't use the hw version but instead
831           * emulate it in the GS shader.
832           * The value is written at the same position, so we don't need to
833           * change anything else.
834           * If ngg is enabled for the draw, the primitive count is written in
835           * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported
836           * vertices is stored in gs_emitted_vertices and the number of prim
837           * is computed based on the output prim type in emit_gs_epilogue.
838           */
839          struct pipe_shader_buffer sbuf;
840          sbuf.buffer = &buffer->b.b;
841          sbuf.buffer_offset = query->buffer.results_end;
842          sbuf.buffer_size = buffer->bo_size - sbuf.buffer_offset;
843          si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
844          SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 1);
845 
846          const uint32_t zero = 0;
847          radeon_begin(cs);
848          /* Clear the emulated counter end value. We don't clear start because it's unused. */
849          va += si_query_pipestat_end_dw_offset(sctx->screen, query->index) * 4;
850          radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
851          radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
852          radeon_emit(va);
853          radeon_emit(va >> 32);
854          radeon_emit(zero);
855          radeon_end();
856 
857          sctx->num_pipeline_stat_emulated_queries++;
858       } else {
859          radeon_begin(cs);
860          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
861          radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
862          radeon_emit(va);
863          radeon_emit(va >> 32);
864          radeon_end();
865       }
866       break;
867    }
868    default:
869       assert(0);
870    }
871    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
872                              RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
873 }
874 
875 static void si_update_hw_pipeline_stats(struct si_context *sctx, unsigned type, int diff)
876 {
877    if (type == PIPE_QUERY_PIPELINE_STATISTICS ||
878        /* All streamout queries: */
879        type == PIPE_QUERY_PRIMITIVES_GENERATED ||
880        type == PIPE_QUERY_PRIMITIVES_EMITTED ||
881        type == PIPE_QUERY_SO_STATISTICS ||
882        type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
883        type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
884       if (type == PIPE_QUERY_PIPELINE_STATISTICS)
885          sctx->num_pipeline_stat_queries += diff;
886 
887       /* Increment for pipeline statistics and streamout queries. */
888       sctx->num_hw_pipestat_streamout_queries += diff;
889 
890       /* Enable/disable pipeline stats if we have any queries. */
891       if (diff == 1 && sctx->num_hw_pipestat_streamout_queries == 1) {
892          sctx->barrier_flags &= ~SI_BARRIER_EVENT_PIPELINESTAT_STOP;
893          sctx->barrier_flags |= SI_BARRIER_EVENT_PIPELINESTAT_START;
894          si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
895       } else if (diff == -1 && sctx->num_hw_pipestat_streamout_queries == 0) {
896          sctx->barrier_flags &= ~SI_BARRIER_EVENT_PIPELINESTAT_START;
897          sctx->barrier_flags |= SI_BARRIER_EVENT_PIPELINESTAT_STOP;
898          si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
899       }
900    }
901 }
902 
903 static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
904 {
905    uint64_t va;
906 
907    if (!query->buffer.buf && query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
908       si_resource_reference(&query->buffer.buf, sctx->pipeline_stats_query_buf);
909 
910    /* Don't realloc pipeline_stats_query_buf */
911    if ((!(query->flags & SI_QUERY_EMULATE_GS_COUNTERS) || !sctx->pipeline_stats_query_buf) &&
912        !si_query_buffer_alloc(sctx, &query->buffer, si_query_hw_prepare_buffer, query->result_size))
913       return;
914 
915    if (query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
916       si_resource_reference(&sctx->pipeline_stats_query_buf, query->buffer.buf);
917 
918    si_update_occlusion_query_state(sctx, query->b.type, 1);
919    si_update_prims_generated_query_state(sctx, query->b.type, 1);
920    si_update_hw_pipeline_stats(sctx, query->b.type, 1);
921 
922    si_need_gfx_cs_space(sctx, 0);
923 
924    va = query->buffer.buf->gpu_address + query->buffer.results_end;
925    si_query_hw_do_emit_start(sctx, query, query->buffer.buf, va);
926 }
927 
928 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
929                                      struct si_resource *buffer, uint64_t va)
930 {
931    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
932    uint64_t fence_va = 0;
933 
934    switch (query->b.type) {
935    case PIPE_QUERY_OCCLUSION_COUNTER:
936    case PIPE_QUERY_OCCLUSION_PREDICATE:
937    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
938       fence_va = va + sctx->screen->info.max_render_backends * 16;
939       va += 8;
940       radeon_begin(cs);
941       if (sctx->gfx_level >= GFX11 &&
942           sctx->screen->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
943          radeon_emit(PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
944       } else {
945          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
946          if (sctx->gfx_level >= GFX11)
947             radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
948          else
949             radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
950       }
951       radeon_emit(va);
952       radeon_emit(va >> 32);
953       radeon_end();
954       break;
955    }
956    case PIPE_QUERY_PRIMITIVES_EMITTED:
957    case PIPE_QUERY_PRIMITIVES_GENERATED:
958    case PIPE_QUERY_SO_STATISTICS:
959    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
960       va += 16;
961       emit_sample_streamout(cs, va, query->stream);
962       break;
963    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
964       va += 16;
965       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
966          emit_sample_streamout(cs, va + 32 * stream, stream);
967       break;
968    case PIPE_QUERY_TIME_ELAPSED:
969       va += 8;
970       FALLTHROUGH;
971    case PIPE_QUERY_TIMESTAMP:
972       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
973                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
974       fence_va = va + 8;
975       break;
976    case PIPE_QUERY_PIPELINE_STATISTICS: {
977       unsigned sample_size = (query->result_size - 8) / 2;
978 
979       va += sample_size;
980       fence_va = va + sample_size;
981 
982       radeon_begin(cs);
983       if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
984          radeon_event_write(V_028A90_VS_PARTIAL_FLUSH);
985 
986          if (--sctx->num_pipeline_stat_emulated_queries == 0) {
987             si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
988             SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 0);
989          }
990       } else {
991          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
992          radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
993          radeon_emit(va);
994          radeon_emit(va >> 32);
995       }
996       radeon_end();
997       break;
998    }
999    default:
1000       assert(0);
1001    }
1002    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
1003                              RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
1004 
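   /* Queries with a fence dword get 0x80000000 written there at bottom-of-pipe; the result
    * paths use that bit to tell whether the end-of-query snapshot has landed. */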
1005    if (fence_va) {
1006       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
1007                         EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
1008                         query->b.type);
1009    }
1010 }
1011 
1012 static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
1013 {
1014    uint64_t va;
1015 
1016    /* Queries that need a begin already allocated their buffer in begin_query; only NO_START queries allocate here. */
1017    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1018       si_need_gfx_cs_space(sctx, 0);
1019       if (!si_query_buffer_alloc(sctx, &query->buffer, si_query_hw_prepare_buffer,
1020                                  query->result_size))
1021          return;
1022    }
1023 
1024    if (!query->buffer.buf)
1025       return; // previous buffer allocation failure
1026 
1027    /* emit end query */
1028    va = query->buffer.buf->gpu_address + query->buffer.results_end;
1029 
1030    si_query_hw_do_emit_stop(sctx, query, query->buffer.buf, va);
1031 
1032    query->buffer.results_end += query->result_size;
1033 
1034    si_update_occlusion_query_state(sctx, query->b.type, -1);
1035    si_update_prims_generated_query_state(sctx, query->b.type, -1);
1036    si_update_hw_pipeline_stats(sctx, query->b.type, -1);
1037 }
1038 
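/* Point SET_PREDICATION at a 64-bit value inside the query buffer. GFX9+ takes the operation
 * in a separate dword; older chips pack the operation bits into the high-address dword. */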
1039 static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
1040                                uint32_t op)
1041 {
1042    struct radeon_cmdbuf *cs = &ctx->gfx_cs;
1043 
1044    radeon_begin(cs);
1045 
1046    if (ctx->gfx_level >= GFX9) {
1047       radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
1048       radeon_emit(op);
1049       radeon_emit(va);
1050       radeon_emit(va >> 32);
1051    } else {
1052       radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
1053       radeon_emit(va);
1054       radeon_emit(op | ((va >> 32) & 0xFF));
1055    }
1056    radeon_end();
1057 
1058    radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ | RADEON_PRIO_QUERY);
1059 }
1060 
1061 static void si_emit_query_predication(struct si_context *ctx, unsigned index)
1062 {
1063    uint32_t op;
1064    bool flag_wait, invert;
1065 
1066    struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
1067    if (!query)
1068       return;
1069 
1070    invert = ctx->render_cond_invert;
1071    flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
1072                ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
1073 
1074    if (ctx->gfx_level >= GFX11 && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1075                                    query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
1076       struct gfx11_sh_query *gfx10_query = (struct gfx11_sh_query *)query;
1077       struct gfx11_sh_query_buffer *qbuf, *first, *last;
1078 
1079       op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1080 
1081       /* if true then invert, see GL_ARB_conditional_render_inverted */
1082       if (!invert)
1083          op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1084       else
1085          op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1086 
1087       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1088 
1089       first = gfx10_query->first;
1090       last = gfx10_query->last;
1091 
1092       while (first) {
1093          qbuf = first;
1094          if (first != last)
1095             first = list_entry(qbuf->list.next, struct gfx11_sh_query_buffer, list);
1096          else
1097             first = NULL;
1098 
1099          unsigned results_base = gfx10_query->first_begin;
1100          uint64_t va_base = qbuf->buf->gpu_address;
1101          uint64_t va = va_base + results_base;
1102 
1103          unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
1104          unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;
1105 
1106          unsigned count = (end - begin) / sizeof(struct gfx11_sh_query_buffer_mem);
1107          do {
1108             if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1109                for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1110                   emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);
1111 
1112                   /* set CONTINUE bit for all packets except the first */
1113                   op |= PREDICATION_CONTINUE;
1114                }
1115             } else {
1116                emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
1117                op |= PREDICATION_CONTINUE;
1118             }
1119 
1120             results_base += sizeof(struct gfx11_sh_query_buffer_mem);
1121          } while (count--);
1122       }
1123    } else {
1124       struct si_query_buffer *qbuf;
1125 
1126       if (query->workaround_buf) {
1127          op = PRED_OP(PREDICATION_OP_BOOL64);
1128       } else {
1129          switch (query->b.type) {
1130          case PIPE_QUERY_OCCLUSION_COUNTER:
1131          case PIPE_QUERY_OCCLUSION_PREDICATE:
1132          case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1133             op = PRED_OP(PREDICATION_OP_ZPASS);
1134             break;
1135          case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1136          case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1137             op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1138             invert = !invert;
1139             break;
1140          default:
1141             assert(0);
1142             return;
1143          }
1144       }
1145 
1146       /* if true then invert, see GL_ARB_conditional_render_inverted */
1147       if (invert)
1148          op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1149       else
1150          op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1151 
1152       /* Use the value written by compute shader as a workaround. Note that
1153        * the wait flag does not apply in this predication mode.
1154        *
1155        * The shader outputs the result value to L2. Workarounds only affect GFX8
1156        * and later, where the CP reads data from L2, so we don't need an
1157        * additional flush.
1158        */
1159       if (query->workaround_buf) {
1160          uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
1161          emit_set_predicate(ctx, query->workaround_buf, va, op);
1162          return;
1163       }
1164 
1165       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1166 
1167       /* emit predicate packets for all data blocks */
1168       for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1169          unsigned results_base = 0;
1170          uint64_t va_base = qbuf->buf->gpu_address;
1171 
1172          while (results_base < qbuf->results_end) {
1173             uint64_t va = va_base + results_base;
1174 
1175             if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1176                for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1177                   emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1178 
1179                   /* set CONTINUE bit for all packets except the first */
1180                   op |= PREDICATION_CONTINUE;
1181                }
1182             } else {
1183                emit_set_predicate(ctx, qbuf->buf, va, op);
1184                op |= PREDICATION_CONTINUE;
1185             }
1186 
1187             results_base += query->result_size;
1188          }
1189       }
1190    }
1191 }
1192 
1193 static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
1194                                           unsigned index)
1195 {
1196    struct si_screen *sscreen = (struct si_screen *)ctx->screen;
1197 
1198    if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
1199        (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
1200       return si_query_sw_create(query_type);
1201 
1202    if (sscreen->info.gfx_level >= GFX11 &&
1203        (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
1204         query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
1205         query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1206         query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
1207       return gfx11_sh_query_create(sscreen, query_type, index);
1208 
1209    return si_query_hw_create(sscreen, query_type, index);
1210 }
1211 
1212 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1213 {
1214    struct si_context *sctx = (struct si_context *)ctx;
1215    struct si_query *squery = (struct si_query *)query;
1216 
1217    squery->ops->destroy(sctx, squery);
1218 }
1219 
1220 static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
1221 {
1222    struct si_context *sctx = (struct si_context *)ctx;
1223    struct si_query *squery = (struct si_query *)query;
1224 
1225    return squery->ops->begin(sctx, squery);
1226 }
1227 
1228 static bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
1229 {
1230    struct si_query_hw *query = (struct si_query_hw *)squery;
1231 
1232    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1233       assert(0);
1234       return false;
1235    }
1236 
1237    if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
1238       si_query_buffer_reset(sctx, &query->buffer);
1239 
1240    si_resource_reference(&query->workaround_buf, NULL);
1241 
1242    si_query_hw_emit_start(sctx, query);
1243    if (!query->buffer.buf)
1244       return false;
1245 
1246    list_addtail(&query->b.active_list, &sctx->active_queries);
1247    sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1248    return true;
1249 }
1250 
1251 static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
1252 {
1253    struct si_context *sctx = (struct si_context *)ctx;
1254    struct si_query *squery = (struct si_query *)query;
1255 
1256    return squery->ops->end(sctx, squery);
1257 }
1258 
1259 static bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
1260 {
1261    struct si_query_hw *query = (struct si_query_hw *)squery;
1262 
1263    if (query->flags & SI_QUERY_HW_FLAG_NO_START)
1264       si_query_buffer_reset(sctx, &query->buffer);
1265 
1266    si_query_hw_emit_stop(sctx, query);
1267 
1268    if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
1269       list_delinit(&query->b.active_list);
1270       sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
1271    }
1272 
1273    if (!query->buffer.buf)
1274       return false;
1275 
1276    return true;
1277 }
1278 
1279 static void si_get_hw_query_result_shader_params(struct si_context *sctx,
1280                                                  struct si_query_hw *squery, int index,
1281                                                  struct si_hw_query_params *params)
1282 {
1283    unsigned max_rbs = sctx->screen->info.max_render_backends;
1284 
1285    params->pair_stride = 0;
1286    params->pair_count = 1;
1287 
1288    switch (squery->b.type) {
1289    case PIPE_QUERY_OCCLUSION_COUNTER:
1290    case PIPE_QUERY_OCCLUSION_PREDICATE:
1291    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1292       params->start_offset = 0;
1293       params->end_offset = 8;
1294       params->fence_offset = max_rbs * 16;
1295       params->pair_stride = 16;
1296       params->pair_count = max_rbs;
1297       break;
1298    case PIPE_QUERY_TIME_ELAPSED:
1299       params->start_offset = 0;
1300       params->end_offset = 8;
1301       params->fence_offset = 16;
1302       break;
1303    case PIPE_QUERY_TIMESTAMP:
1304       params->start_offset = 0;
1305       params->end_offset = 0;
1306       params->fence_offset = 8;
1307       break;
1308    case PIPE_QUERY_PRIMITIVES_EMITTED:
1309       params->start_offset = 8;
1310       params->end_offset = 24;
1311       params->fence_offset = params->end_offset + 4;
1312       break;
1313    case PIPE_QUERY_PRIMITIVES_GENERATED:
1314       params->start_offset = 0;
1315       params->end_offset = 16;
1316       params->fence_offset = params->end_offset + 4;
1317       break;
1318    case PIPE_QUERY_SO_STATISTICS:
1319       params->start_offset = 8 - index * 8;
1320       params->end_offset = 24 - index * 8;
1321       params->fence_offset = params->end_offset + 4;
1322       break;
1323    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1324       params->pair_count = SI_MAX_STREAMS;
1325       params->pair_stride = 32;
1326       FALLTHROUGH;
1327    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1328       params->start_offset = 0;
1329       params->end_offset = 16;
1330 
1331       /* We can re-use the high dword of the last 64-bit value as a
1332        * fence: it is initialized as 0, and the high bit is set by
1333        * the write of the streamout stats event.
1334        */
1335       params->fence_offset = squery->result_size - 4;
1336       break;
1337    case PIPE_QUERY_PIPELINE_STATISTICS: {
1338       params->start_offset = si_query_pipestat_dw_offset(index) * 4;
1339       params->end_offset = si_query_pipestat_end_dw_offset(sctx->screen, index) * 4;
1340       params->fence_offset = si_query_pipestats_num_results(sctx->screen) * 16;
1341       break;
1342    }
1343    default:
1344       unreachable("si_get_hw_query_params unsupported");
1345    }
1346 }
1347 
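/* Read one (begin, end) pair of 64-bit counters at the given dword indices and return the
 * difference. With test_status_bit, the top bit of each value is a "result written" flag and
 * 0 is returned until both snapshots are present. */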
1348 static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
1349                                      bool test_status_bit)
1350 {
1351    uint32_t *current_result = (uint32_t *)map;
1352    uint64_t start, end;
1353 
1354    start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
1355    end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
1356 
1357    if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1358       return end - start;
1359    }
1360    return 0;
1361 }
1362 
1363 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1364                                    void *buffer, union pipe_query_result *result)
1365 {
1366    unsigned max_rbs = sscreen->info.max_render_backends;
1367 
1368    switch (query->b.type) {
1369    case PIPE_QUERY_OCCLUSION_COUNTER: {
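      /* Each enabled render backend writes its own (begin, end) pair; sum the deltas. */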
1370       for (unsigned i = 0; i < max_rbs; ++i) {
1371          unsigned results_base = i * 16;
1372          result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1373       }
1374       break;
1375    }
1376    case PIPE_QUERY_OCCLUSION_PREDICATE:
1377    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1378       for (unsigned i = 0; i < max_rbs; ++i) {
1379          unsigned results_base = i * 16;
1380          result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1381       }
1382       break;
1383    }
1384    case PIPE_QUERY_TIME_ELAPSED:
1385       result->u64 += si_query_read_result(buffer, 0, 2, false);
1386       break;
1387    case PIPE_QUERY_TIMESTAMP:
1388       result->u64 = *(uint64_t *)buffer;
1389       break;
1390    case PIPE_QUERY_PRIMITIVES_EMITTED:
1391       /* SAMPLE_STREAMOUTSTATS stores this structure:
1392        * {
1393        *    u64 NumPrimitivesWritten;
1394        *    u64 PrimitiveStorageNeeded;
1395        * }
1396        * We only need NumPrimitivesWritten here. */
1397       result->u64 += si_query_read_result(buffer, 2, 6, true);
1398       break;
1399    case PIPE_QUERY_PRIMITIVES_GENERATED:
1400       /* Here we read PrimitiveStorageNeeded. */
1401       result->u64 += si_query_read_result(buffer, 0, 4, true);
1402       break;
1403    case PIPE_QUERY_SO_STATISTICS:
1404       result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1405       result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1406       break;
1407    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1408       result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1409                                   si_query_read_result(buffer, 0, 4, true);
1410       break;
1411    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1412       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1413          result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1414                                      si_query_read_result(buffer, 0, 4, true);
1415          buffer = (char *)buffer + 32;
1416       }
1417       break;
1418    case PIPE_QUERY_PIPELINE_STATISTICS:
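      /* 11 counters of pipe_pipeline_statistics: IA, VS, HS, DS, GS, clipper, PS, CS. */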
1419       for (int i = 0; i < 11; i++) {
1420          result->pipeline_statistics.counters[i] +=
1421             si_query_read_result(buffer, si_query_pipestat_dw_offset(i),
1422                                  si_query_pipestat_end_dw_offset(sscreen, i), false);
1423       }
1424 #if 0 /* for testing */
1425       printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1426              "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1427              "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1428              result->pipeline_statistics.ia_vertices,
1429              result->pipeline_statistics.ia_primitives,
1430              result->pipeline_statistics.vs_invocations,
1431              result->pipeline_statistics.hs_invocations,
1432              result->pipeline_statistics.ds_invocations,
1433              result->pipeline_statistics.gs_invocations,
1434              result->pipeline_statistics.gs_primitives,
1435              result->pipeline_statistics.c_invocations,
1436              result->pipeline_statistics.c_primitives,
1437              result->pipeline_statistics.ps_invocations,
1438              result->pipeline_statistics.cs_invocations);
1439 #endif
1440       break;
1441    default:
1442       assert(0);
1443    }
1444 }
1445 
1446 static void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
1447 {
1448    si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
1449 }
1450 
1451 static void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
1452 {
1453    si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
1454 }
1455 
1456 static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1457                                 union pipe_query_result *result)
1458 {
1459    struct si_context *sctx = (struct si_context *)ctx;
1460    struct si_query *squery = (struct si_query *)query;
1461 
1462    return squery->ops->get_result(sctx, squery, wait, result);
1463 }
1464 
1465 static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1466                                          enum pipe_query_flags flags, enum pipe_query_value_type result_type,
1467                                          int index, struct pipe_resource *resource, unsigned offset)
1468 {
1469    struct si_context *sctx = (struct si_context *)ctx;
1470    struct si_query *squery = (struct si_query *)query;
1471 
1472    squery->ops->get_result_resource(sctx, squery, flags, result_type, index, resource, offset);
1473 }
1474 
1475 static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1476 {
1477    util_query_clear_result(result, query->b.type);
1478 }
1479 
1480 static bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1481                                    union pipe_query_result *result)
1482 {
1483    struct si_screen *sscreen = sctx->screen;
1484    struct si_query_hw *query = (struct si_query_hw *)squery;
1485    struct si_query_buffer *qbuf;
1486 
1487    si_query_hw_clear_result(query, result);
1488 
1489    for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1490       unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1491       unsigned results_base = 0;
1492       void *map;
1493 
1494       if (squery->b.flushed)
1495          map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
1496       else
1497          map = si_buffer_map(sctx, qbuf->buf, usage);
1498 
1499       if (!map)
1500          return false;
1501 
1502       while (results_base != qbuf->results_end) {
1503          si_query_hw_add_result(sscreen, query, map + results_base, result);
1504          results_base += query->result_size;
1505       }
1506    }
1507 
1508    /* Convert GPU ticks to nanoseconds; clock_crystal_freq is in kHz. */
1509    if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
1510        squery->type == PIPE_QUERY_TIMESTAMP) {
1511       result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1512    }
1513    return true;
1514 }
1515 
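/* Write query results into a GPU buffer (the query-buffer-object path). A small
 * compute shader (si_create_query_result_cs) accumulates the results from every
 * buffer in the query's chain; partial sums are carried through a 16-byte zeroed
 * scratch allocation until the last buffer, which writes to the destination.
 */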
1516 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1517                                             enum pipe_query_flags flags,
1518                                             enum pipe_query_value_type result_type,
1519                                             int index, struct pipe_resource *resource,
1520                                             unsigned offset)
1521 {
1522    struct si_query_hw *query = (struct si_query_hw *)squery;
1523    struct si_query_buffer *qbuf;
1524    struct si_query_buffer *qbuf_prev;
1525    struct pipe_resource *tmp_buffer = NULL;
1526    unsigned tmp_buffer_offset = 0;
1527    struct si_qbo_state saved_state = {};
1528    struct pipe_grid_info grid = {};
1529    struct pipe_constant_buffer constant_buffer = {};
1530    struct pipe_shader_buffer ssbo[3];
1531    struct si_hw_query_params params;
1532    struct {
1533       uint32_t end_offset;
1534       uint32_t result_stride;
1535       uint32_t result_count;
1536       uint32_t config;
1537       uint32_t fence_offset;
1538       uint32_t pair_stride;
1539       uint32_t pair_count;
1540    } consts;
1541 
1542    if (!sctx->query_result_shader) {
1543       sctx->query_result_shader = si_create_query_result_cs(sctx);
1544       if (!sctx->query_result_shader)
1545          return;
1546    }
1547 
1548    if (query->buffer.previous) {
1549       u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1550       if (!tmp_buffer)
1551          return;
1552    }
1553 
1554    si_save_qbo_state(sctx, &saved_state);
1555 
1556    si_get_hw_query_result_shader_params(sctx, query, index >= 0 ? index : 0, &params);
1557    consts.end_offset = params.end_offset - params.start_offset;
1558    consts.fence_offset = params.fence_offset - params.start_offset;
1559    consts.result_stride = query->result_size;
1560    consts.pair_stride = params.pair_stride;
1561    consts.pair_count = params.pair_count;
1562 
1563    constant_buffer.buffer_size = sizeof(consts);
1564    constant_buffer.user_buffer = &consts;
1565 
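   /* Shader buffer bindings: slot 0 = the query buffer being read (set per
    * iteration below), slot 1 = scratch accumulator used to chain results
    * across buffers, slot 2 = the final destination once the last buffer in
    * the chain is processed.
    */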
1566    ssbo[1].buffer = tmp_buffer;
1567    ssbo[1].buffer_offset = tmp_buffer_offset;
1568    ssbo[1].buffer_size = 16;
1569 
1570    ssbo[2] = ssbo[1];
1571 
1572    grid.block[0] = 1;
1573    grid.block[1] = 1;
1574    grid.block[2] = 1;
1575    grid.grid[0] = 1;
1576    grid.grid[1] = 1;
1577    grid.grid[2] = 1;
1578 
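   /* consts.config packs flags consumed by si_create_query_result_cs:
    * 1 = read previously accumulated value, 2 = write accumulated value for
    * chaining, 4 = write result availability only, 8 = convert result to a
    * boolean, 16 = read a single timestamp dword, 32 = apply timestamp
    * conversion, 64 = 64-bit output, 128 = signed 32-bit output,
    * 256 = boolean from overflow comparison.
    */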
1579    consts.config = 0;
1580    if (index < 0)
1581       consts.config |= 4;
1582    if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1583        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1584       consts.config |= 8;
1585    else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1586             query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1587       consts.config |= 8 | 256;
1588    else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1589       consts.config |= 32;
1590 
1591    switch (result_type) {
1592    case PIPE_QUERY_TYPE_U64:
1593    case PIPE_QUERY_TYPE_I64:
1594       consts.config |= 64;
1595       break;
1596    case PIPE_QUERY_TYPE_I32:
1597       consts.config |= 128;
1598       break;
1599    case PIPE_QUERY_TYPE_U32:
1600       break;
1601    }
1602 
1603    sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM |
1604                           (sctx->gfx_level <= GFX8 ? SI_BARRIER_INV_L2 : 0);
1605    si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
1606 
1607    for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1608       if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1609          qbuf_prev = qbuf->previous;
1610          consts.result_count = qbuf->results_end / query->result_size;
1611          consts.config &= ~3;
1612          if (qbuf != &query->buffer)
1613             consts.config |= 1;
1614          if (qbuf->previous)
1615             consts.config |= 2;
1616       } else {
1617          /* Only read the last timestamp. */
1618          qbuf_prev = NULL;
1619          consts.result_count = 0;
1620          consts.config |= 16;
1621          params.start_offset += qbuf->results_end - query->result_size;
1622       }
1623 
1624       sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
1625 
1626       ssbo[0].buffer = &qbuf->buf->b.b;
1627       ssbo[0].buffer_offset = params.start_offset;
1628       ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1629 
1630       if (!qbuf->previous) {
1631          ssbo[2].buffer = resource;
1632          ssbo[2].buffer_offset = offset;
1633          ssbo[2].buffer_size = resource->width0 - offset;
1634       }
1635 
1636       if ((flags & PIPE_QUERY_WAIT) && qbuf == &query->buffer) {
1637          uint64_t va;
1638 
1639          /* Wait for result availability. Wait only for readiness
1640           * of the last entry, since the fence writes should be
1641           * serialized in the CP.
1642           */
1643          va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1644          va += params.fence_offset;
1645 
1646          si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
1647       }
1648 
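      /* Only ssbo[2] (the destination buffer) is written by the result shader. */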
1649       unsigned writable_bitmask = 0x4;
1650 
1651       si_barrier_before_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);
1652       si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader,
1653                                     3, ssbo, writable_bitmask, false);
1654       si_barrier_after_internal_op(sctx, 0, 3, ssbo, writable_bitmask, 0, NULL);
1655    }
1656 
1657    si_restore_qbo_state(sctx, &saved_state);
1658    pipe_resource_reference(&tmp_buffer, NULL);
1659 }
1660 
1661 static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
1662                                 enum pipe_render_cond_flag mode)
1663 {
1664    struct si_context *sctx = (struct si_context *)ctx;
1665    struct si_query_hw *squery = (struct si_query_hw *)query;
1666    struct si_atom *atom = &sctx->atoms.s.render_cond;
1667 
1668    if (query) {
1669       bool needs_workaround = false;
1670 
1671       /* A firmware regression on GFX8 (and early GFX9 PFP firmware) causes
1672        * successive SET_PREDICATION packets to give the wrong answer for
1673        * non-inverted stream overflow predication.
1674        */
1675       if (((sctx->gfx_level == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
1676            (sctx->gfx_level == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
1677           !condition &&
1678           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1679            (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1680             (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
1681          needs_workaround = true;
1682       }
1683 
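      /* Workaround: pre-compute the predicate with the query-result compute
       * shader into a small zeroed allocation and predicate rendering on that
       * 64-bit value instead of on the raw streamout statistics.
       */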
1684       if (needs_workaround && !squery->workaround_buf) {
1685          bool old_render_cond_enabled = sctx->render_cond_enabled;
1686          sctx->render_cond_enabled = false;
1687 
1688          u_suballocator_alloc(&sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
1689                               (struct pipe_resource **)&squery->workaround_buf);
1690 
1691          /* Reset to NULL to avoid a redundant SET_PREDICATION
1692           * from launching the compute grid.
1693           */
1694          sctx->render_cond = NULL;
1695 
1696          ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
1697                                         &squery->workaround_buf->b.b, squery->workaround_offset);
1698 
1699          /* Setting this in the render cond atom is too late,
1700           * so set it here. */
1701          if (sctx->gfx_level <= GFX8) {
1702             sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME;
1703             si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
1704          }
1705 
1706          sctx->render_cond_enabled = old_render_cond_enabled;
1707       }
1708    }
1709 
1710    sctx->render_cond = query;
1711    sctx->render_cond_invert = condition;
1712    sctx->render_cond_mode = mode;
1713    sctx->render_cond_enabled = query;
1714 
1715    si_set_atom_dirty(sctx, atom, query != NULL);
1716 }
1717 
1718 void si_suspend_queries(struct si_context *sctx)
1719 {
1720    struct si_query *query;
1721 
1722    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1723       query->ops->suspend(sctx, query);
1724 }
1725 
1726 void si_resume_queries(struct si_context *sctx)
1727 {
1728    struct si_query *query;
1729 
1730    /* Check CS space here. Resuming must not be interrupted by flushes. */
1731    si_need_gfx_cs_space(sctx, 0);
1732 
1733    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1734       query->ops->resume(sctx, query);
1735 }
1736 
1737 #define XFULL(name_, query_type_, type_, result_type_, group_id_)                                  \
1738    {                                                                                               \
1739       .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1740       .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_           \
1741    }
1742 
1743 #define X(name_, query_type_, type_, result_type_)                                                 \
1744    XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1745 
1746 #define XG(group_, name_, query_type_, type_, result_type_)                                        \
1747    XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1748 
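/* Software driver queries exposed via pipe_screen::get_driver_query_info,
 * e.g. for GALLIUM_HUD. X() entries have no group; XG() entries belong to a
 * named software query group such as GPIN.
 */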
1749 static struct pipe_driver_query_info si_driver_query_list[] = {
1750    X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1751    X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1752    X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1753    X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1754    X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1755    X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1756    X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1757    X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1758    X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1759    X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1760    X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1761    X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1762    X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1763    X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1764    X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1765    X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1766    X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1767    X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1768    X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1769    X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1770    X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1771    X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1772    X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1773    X("slab-wasted-VRAM", SLAB_WASTED_VRAM, BYTES, AVERAGE),
1774    X("slab-wasted-GTT", SLAB_WASTED_GTT, BYTES, AVERAGE),
1775    X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1776    X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1777    X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1778    X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1779    X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1780    X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1781    X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1782    X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1783    X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1784    X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1785    X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1786    X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1787    X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1788    X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1789    X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1790    X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1791    X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1792    X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1793 
1794    /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1795     * which use them as a fallback path to detect the GPU type.
1796     *
1797     * Note: The names of these queries are significant for GPUPerfStudio
1798     * (and possibly their order as well). */
1799    XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1800    XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1801    XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1802    XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1803    XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1804 
1805    X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1806    X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1807    X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1808 
1809    /* The following queries must be at the end of the list because their
1810     * availability is adjusted dynamically based on the DRM version. */
1811    X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1812    X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1813    X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1814    X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1815    X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1816    X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1817    X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1818    X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1819    X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1820    X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1821    X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1822    X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1823    X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1824    X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1825 
1826    /* SRBM_STATUS2 */
1827    X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1828 
1829    /* CP_STAT */
1830    X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1831    X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1832    X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1833    X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1834    X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1835    X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1836 };
1837 
1838 #undef X
1839 #undef XG
1840 #undef XFULL
1841 
1842 static unsigned si_get_num_queries(struct si_screen *sscreen)
1843 {
1844    /* amdgpu */
1845    if (sscreen->info.is_amdgpu) {
1846       if (sscreen->info.gfx_level >= GFX8)
1847          return ARRAY_SIZE(si_driver_query_list);
1848       else
1849          return ARRAY_SIZE(si_driver_query_list) - 7;
1850    }
1851 
1852    /* radeon */
1853    if (sscreen->info.has_read_registers_query) {
1854       if (sscreen->info.gfx_level == GFX7)
1855          return ARRAY_SIZE(si_driver_query_list) - 6;
1856       else
1857          return ARRAY_SIZE(si_driver_query_list) - 7;
1858    }
1859 
1860    /* Without register reads, the GPU load queries at the end are unavailable. */
1861    return ARRAY_SIZE(si_driver_query_list) - 21;
1862 }
1860 
1861 static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1862                                     struct pipe_driver_query_info *info)
1863 {
1864    struct si_screen *sscreen = (struct si_screen *)screen;
1865    unsigned num_queries = si_get_num_queries(sscreen);
1866 
1867    if (!info) {
1868       unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1869 
1870       return num_queries + num_perfcounters;
1871    }
1872 
1873    if (index >= num_queries)
1874       return si_get_perfcounter_info(sscreen, index - num_queries, info);
1875 
1876    *info = si_driver_query_list[index];
1877 
1878    switch (info->query_type) {
1879    case SI_QUERY_REQUESTED_VRAM:
1880    case SI_QUERY_VRAM_USAGE:
1881    case SI_QUERY_MAPPED_VRAM:
1882    case SI_QUERY_SLAB_WASTED_VRAM:
1883       info->max_value.u64 = (uint64_t)sscreen->info.vram_size_kb * 1024;
1884       break;
1885    case SI_QUERY_REQUESTED_GTT:
1886    case SI_QUERY_GTT_USAGE:
1887    case SI_QUERY_MAPPED_GTT:
1888    case SI_QUERY_SLAB_WASTED_GTT:
1889       info->max_value.u64 = (uint64_t)sscreen->info.gart_size_kb * 1024;
1890       break;
1891    case SI_QUERY_GPU_TEMPERATURE:
1892       info->max_value.u64 = 125;
1893       break;
1894    case SI_QUERY_VRAM_VIS_USAGE:
1895       info->max_value.u64 = (uint64_t)sscreen->info.vram_vis_size_kb * 1024;
1896       break;
1897    }
1898 
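   /* Perf counter groups are enumerated before the software query groups
    * (see si_get_driver_query_group_info), so bias the group index. */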
1899    if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1900       info->group_id += sscreen->perfcounters->base.num_groups;
1901 
1902    return 1;
1903 }
1904 
1905 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1906  * performance counter groups, so be careful when changing this and related
1907  * functions.
1908  */
1909 static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1910                                           struct pipe_driver_query_group_info *info)
1911 {
1912    struct si_screen *sscreen = (struct si_screen *)screen;
1913    unsigned num_pc_groups = 0;
1914 
1915    if (sscreen->perfcounters)
1916       num_pc_groups = sscreen->perfcounters->base.num_groups;
1917 
1918    if (!info)
1919       return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1920 
1921    if (index < num_pc_groups)
1922       return si_get_perfcounter_group_info(sscreen, index, info);
1923 
1924    index -= num_pc_groups;
1925    if (index >= SI_NUM_SW_QUERY_GROUPS)
1926       return 0;
1927 
1928    info->name = "GPIN";
1929    info->max_active_queries = 5;
1930    info->num_queries = 5;
1931    return 1;
1932 }
1933 
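/* Vtables: hw_query_ops backs GPU-sampled queries, sw_query_ops backs
 * driver/winsys counters that are read on the CPU. */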
1934 static const struct si_query_ops hw_query_ops = {
1935    .destroy = si_query_hw_destroy,
1936    .begin = si_query_hw_begin,
1937    .end = si_query_hw_end,
1938    .get_result = si_query_hw_get_result,
1939    .get_result_resource = si_query_hw_get_result_resource,
1940 
1941    .suspend = si_query_hw_suspend,
1942    .resume = si_query_hw_resume,
1943 };
1944 
1945 static const struct si_query_ops sw_query_ops = {
1946    .destroy = si_query_sw_destroy,
1947    .begin = si_query_sw_begin,
1948    .end = si_query_sw_end,
1949    .get_result = si_query_sw_get_result,
1950    .get_result_resource = NULL
1951 };
1952 
1953 void si_init_query_functions(struct si_context *sctx)
1954 {
1955    sctx->b.create_query = si_create_query;
1956    sctx->b.create_batch_query = si_create_batch_query;
1957    sctx->b.destroy_query = si_destroy_query;
1958    sctx->b.begin_query = si_begin_query;
1959    sctx->b.end_query = si_end_query;
1960    sctx->b.get_query_result = si_get_query_result;
1961    sctx->b.get_query_result_resource = si_get_query_result_resource;
1962 
1963    if (sctx->has_graphics) {
1964       sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1965       sctx->b.render_condition = si_render_condition;
1966    }
1967 
1968    list_inithead(&sctx->active_queries);
1969 }
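
/* Illustrative only: a typical Gallium frontend drives these hooks roughly as
 * follows (sketch, not part of the driver):
 *
 *    struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_OCCLUSION_COUNTER, 0);
 *    ctx->begin_query(ctx, q);
 *    ... draw calls ...
 *    ctx->end_query(ctx, q);
 *    union pipe_query_result result;
 *    if (ctx->get_query_result(ctx, q, true, &result))
 *       ... result.u64 holds the sample count ...
 *    ctx->destroy_query(ctx, q);
 */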
1970 
1971 void si_init_screen_query_functions(struct si_screen *sscreen)
1972 {
1973    sscreen->b.get_driver_query_info = si_get_driver_query_info;
1974    sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1975 }
1976