1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4  * Copyright 2018 Advanced Micro Devices, Inc.
5  * All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * on the rights to use, copy, modify, merge, publish, distribute, sub
11  * license, and/or sell copies of the Software, and to permit persons to whom
12  * the Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the next
15  * paragraph) shall be included in all copies or substantial portions of the
16  * Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24  * USE OR OTHER DEALINGS IN THE SOFTWARE.
25  */
26 
27 #include "si_query.h"
28 #include "si_build_pm4.h"
29 
30 #include "amd/common/sid.h"
31 #include "si_pipe.h"
32 #include "util/os_time.h"
33 #include "util/u_memory.h"
34 #include "util/u_suballoc.h"
35 #include "util/u_upload_mgr.h"
36 
37 static const struct si_query_ops query_hw_ops;
38 
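/* Describes where the start/end counters and the fence live inside one
 * result slice of a hardware query buffer (filled in by
 * si_get_hw_query_params below). For per-RB queries such as occlusion,
 * pair_count/pair_stride describe how many (start, end) pairs exist and
 * how far apart they are.
 */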
39 struct si_hw_query_params {
40    unsigned start_offset;
41    unsigned end_offset;
42    unsigned fence_offset;
43    unsigned pair_stride;
44    unsigned pair_count;
45 };
46 
47 /* Queries without buffer handling or suspend/resume. */
48 struct si_query_sw {
49    struct si_query b;
50 
51    uint64_t begin_result;
52    uint64_t end_result;
53 
54    uint64_t begin_time;
55    uint64_t end_time;
56 
57    /* Fence for GPU_FINISHED. */
58    struct pipe_fence_handle *fence;
59 };
60 
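/* The software-query implementation below snapshots a CPU-side counter in
 * begin() and again in end(); get_result() then reports
 * end_result - begin_result, with begin_time/end_time used only to
 * normalize rate-style results (thread busy percentage, average BO-list
 * size per IB).
 */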
61 static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
62 {
63    struct si_query_sw *query = (struct si_query_sw *)squery;
64 
65    sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
66    FREE(query);
67 }
68 
69 static enum radeon_value_id winsys_id_from_type(unsigned type)
70 {
71    switch (type) {
72    case SI_QUERY_REQUESTED_VRAM:
73       return RADEON_REQUESTED_VRAM_MEMORY;
74    case SI_QUERY_REQUESTED_GTT:
75       return RADEON_REQUESTED_GTT_MEMORY;
76    case SI_QUERY_MAPPED_VRAM:
77       return RADEON_MAPPED_VRAM;
78    case SI_QUERY_MAPPED_GTT:
79       return RADEON_MAPPED_GTT;
80    case SI_QUERY_SLAB_WASTED_VRAM:
81       return RADEON_SLAB_WASTED_VRAM;
82    case SI_QUERY_SLAB_WASTED_GTT:
83       return RADEON_SLAB_WASTED_GTT;
84    case SI_QUERY_BUFFER_WAIT_TIME:
85       return RADEON_BUFFER_WAIT_TIME_NS;
86    case SI_QUERY_NUM_MAPPED_BUFFERS:
87       return RADEON_NUM_MAPPED_BUFFERS;
88    case SI_QUERY_NUM_GFX_IBS:
89       return RADEON_NUM_GFX_IBS;
90    case SI_QUERY_GFX_BO_LIST_SIZE:
91       return RADEON_GFX_BO_LIST_COUNTER;
92    case SI_QUERY_GFX_IB_SIZE:
93       return RADEON_GFX_IB_SIZE_COUNTER;
94    case SI_QUERY_NUM_BYTES_MOVED:
95       return RADEON_NUM_BYTES_MOVED;
96    case SI_QUERY_NUM_EVICTIONS:
97       return RADEON_NUM_EVICTIONS;
98    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
99       return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
100    case SI_QUERY_VRAM_USAGE:
101       return RADEON_VRAM_USAGE;
102    case SI_QUERY_VRAM_VIS_USAGE:
103       return RADEON_VRAM_VIS_USAGE;
104    case SI_QUERY_GTT_USAGE:
105       return RADEON_GTT_USAGE;
106    case SI_QUERY_GPU_TEMPERATURE:
107       return RADEON_GPU_TEMPERATURE;
108    case SI_QUERY_CURRENT_GPU_SCLK:
109       return RADEON_CURRENT_SCLK;
110    case SI_QUERY_CURRENT_GPU_MCLK:
111       return RADEON_CURRENT_MCLK;
112    case SI_QUERY_CS_THREAD_BUSY:
113       return RADEON_CS_THREAD_TIME;
114    default:
115       unreachable("query type does not correspond to winsys id");
116    }
117 }
118 
119 static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
120 {
121    struct si_query_sw *query = (struct si_query_sw *)squery;
122    enum radeon_value_id ws_id;
123 
124    switch (query->b.type) {
125    case PIPE_QUERY_TIMESTAMP_DISJOINT:
126    case PIPE_QUERY_GPU_FINISHED:
127       break;
128    case SI_QUERY_DRAW_CALLS:
129       query->begin_result = sctx->num_draw_calls;
130       break;
131    case SI_QUERY_DECOMPRESS_CALLS:
132       query->begin_result = sctx->num_decompress_calls;
133       break;
134    case SI_QUERY_PRIM_RESTART_CALLS:
135       query->begin_result = sctx->num_prim_restart_calls;
136       break;
137    case SI_QUERY_COMPUTE_CALLS:
138       query->begin_result = sctx->num_compute_calls;
139       break;
140    case SI_QUERY_CP_DMA_CALLS:
141       query->begin_result = sctx->num_cp_dma_calls;
142       break;
143    case SI_QUERY_NUM_VS_FLUSHES:
144       query->begin_result = sctx->num_vs_flushes;
145       break;
146    case SI_QUERY_NUM_PS_FLUSHES:
147       query->begin_result = sctx->num_ps_flushes;
148       break;
149    case SI_QUERY_NUM_CS_FLUSHES:
150       query->begin_result = sctx->num_cs_flushes;
151       break;
152    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
153       query->begin_result = sctx->num_cb_cache_flushes;
154       break;
155    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
156       query->begin_result = sctx->num_db_cache_flushes;
157       break;
158    case SI_QUERY_NUM_L2_INVALIDATES:
159       query->begin_result = sctx->num_L2_invalidates;
160       break;
161    case SI_QUERY_NUM_L2_WRITEBACKS:
162       query->begin_result = sctx->num_L2_writebacks;
163       break;
164    case SI_QUERY_NUM_RESIDENT_HANDLES:
165       query->begin_result = sctx->num_resident_handles;
166       break;
167    case SI_QUERY_TC_OFFLOADED_SLOTS:
168       query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
169       break;
170    case SI_QUERY_TC_DIRECT_SLOTS:
171       query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
172       break;
173    case SI_QUERY_TC_NUM_SYNCS:
174       query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
175       break;
176    case SI_QUERY_REQUESTED_VRAM:
177    case SI_QUERY_REQUESTED_GTT:
178    case SI_QUERY_MAPPED_VRAM:
179    case SI_QUERY_MAPPED_GTT:
180    case SI_QUERY_SLAB_WASTED_VRAM:
181    case SI_QUERY_SLAB_WASTED_GTT:
182    case SI_QUERY_VRAM_USAGE:
183    case SI_QUERY_VRAM_VIS_USAGE:
184    case SI_QUERY_GTT_USAGE:
185    case SI_QUERY_GPU_TEMPERATURE:
186    case SI_QUERY_CURRENT_GPU_SCLK:
187    case SI_QUERY_CURRENT_GPU_MCLK:
188    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
189    case SI_QUERY_NUM_MAPPED_BUFFERS:
190       query->begin_result = 0;
191       break;
192    case SI_QUERY_BUFFER_WAIT_TIME:
193    case SI_QUERY_GFX_IB_SIZE:
194    case SI_QUERY_NUM_GFX_IBS:
195    case SI_QUERY_NUM_BYTES_MOVED:
196    case SI_QUERY_NUM_EVICTIONS:
197    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
198       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
199       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
200       break;
201    }
202    case SI_QUERY_GFX_BO_LIST_SIZE:
203       ws_id = winsys_id_from_type(query->b.type);
204       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
205       query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
206       break;
207    case SI_QUERY_CS_THREAD_BUSY:
208       ws_id = winsys_id_from_type(query->b.type);
209       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
210       query->begin_time = os_time_get_nano();
211       break;
212    case SI_QUERY_GALLIUM_THREAD_BUSY:
213       query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
214       query->begin_time = os_time_get_nano();
215       break;
216    case SI_QUERY_GPU_LOAD:
217    case SI_QUERY_GPU_SHADERS_BUSY:
218    case SI_QUERY_GPU_TA_BUSY:
219    case SI_QUERY_GPU_GDS_BUSY:
220    case SI_QUERY_GPU_VGT_BUSY:
221    case SI_QUERY_GPU_IA_BUSY:
222    case SI_QUERY_GPU_SX_BUSY:
223    case SI_QUERY_GPU_WD_BUSY:
224    case SI_QUERY_GPU_BCI_BUSY:
225    case SI_QUERY_GPU_SC_BUSY:
226    case SI_QUERY_GPU_PA_BUSY:
227    case SI_QUERY_GPU_DB_BUSY:
228    case SI_QUERY_GPU_CP_BUSY:
229    case SI_QUERY_GPU_CB_BUSY:
230    case SI_QUERY_GPU_SDMA_BUSY:
231    case SI_QUERY_GPU_PFP_BUSY:
232    case SI_QUERY_GPU_MEQ_BUSY:
233    case SI_QUERY_GPU_ME_BUSY:
234    case SI_QUERY_GPU_SURF_SYNC_BUSY:
235    case SI_QUERY_GPU_CP_DMA_BUSY:
236    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
237       query->begin_result = si_begin_counter(sctx->screen, query->b.type);
238       break;
239    case SI_QUERY_NUM_COMPILATIONS:
240       query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
241       break;
242    case SI_QUERY_NUM_SHADERS_CREATED:
243       query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
244       break;
245    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
246       query->begin_result = sctx->screen->live_shader_cache.hits;
247       break;
248    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
249       query->begin_result = sctx->screen->live_shader_cache.misses;
250       break;
251    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
252       query->begin_result = sctx->screen->num_memory_shader_cache_hits;
253       break;
254    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
255       query->begin_result = sctx->screen->num_memory_shader_cache_misses;
256       break;
257    case SI_QUERY_DISK_SHADER_CACHE_HITS:
258       query->begin_result = sctx->screen->num_disk_shader_cache_hits;
259       break;
260    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
261       query->begin_result = sctx->screen->num_disk_shader_cache_misses;
262       break;
263    case SI_QUERY_GPIN_ASIC_ID:
264    case SI_QUERY_GPIN_NUM_SIMD:
265    case SI_QUERY_GPIN_NUM_RB:
266    case SI_QUERY_GPIN_NUM_SPI:
267    case SI_QUERY_GPIN_NUM_SE:
268       break;
269    default:
270       unreachable("si_query_sw_begin: bad query type");
271    }
272 
273    return true;
274 }
275 
276 static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
277 {
278    struct si_query_sw *query = (struct si_query_sw *)squery;
279    enum radeon_value_id ws_id;
280 
281    switch (query->b.type) {
282    case PIPE_QUERY_TIMESTAMP_DISJOINT:
283       break;
284    case PIPE_QUERY_GPU_FINISHED:
285       sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
286       break;
287    case SI_QUERY_DRAW_CALLS:
288       query->end_result = sctx->num_draw_calls;
289       break;
290    case SI_QUERY_DECOMPRESS_CALLS:
291       query->end_result = sctx->num_decompress_calls;
292       break;
293    case SI_QUERY_PRIM_RESTART_CALLS:
294       query->end_result = sctx->num_prim_restart_calls;
295       break;
296    case SI_QUERY_COMPUTE_CALLS:
297       query->end_result = sctx->num_compute_calls;
298       break;
299    case SI_QUERY_CP_DMA_CALLS:
300       query->end_result = sctx->num_cp_dma_calls;
301       break;
302    case SI_QUERY_NUM_VS_FLUSHES:
303       query->end_result = sctx->num_vs_flushes;
304       break;
305    case SI_QUERY_NUM_PS_FLUSHES:
306       query->end_result = sctx->num_ps_flushes;
307       break;
308    case SI_QUERY_NUM_CS_FLUSHES:
309       query->end_result = sctx->num_cs_flushes;
310       break;
311    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
312       query->end_result = sctx->num_cb_cache_flushes;
313       break;
314    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
315       query->end_result = sctx->num_db_cache_flushes;
316       break;
317    case SI_QUERY_NUM_L2_INVALIDATES:
318       query->end_result = sctx->num_L2_invalidates;
319       break;
320    case SI_QUERY_NUM_L2_WRITEBACKS:
321       query->end_result = sctx->num_L2_writebacks;
322       break;
323    case SI_QUERY_NUM_RESIDENT_HANDLES:
324       query->end_result = sctx->num_resident_handles;
325       break;
326    case SI_QUERY_TC_OFFLOADED_SLOTS:
327       query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
328       break;
329    case SI_QUERY_TC_DIRECT_SLOTS:
330       query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
331       break;
332    case SI_QUERY_TC_NUM_SYNCS:
333       query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
334       break;
335    case SI_QUERY_REQUESTED_VRAM:
336    case SI_QUERY_REQUESTED_GTT:
337    case SI_QUERY_MAPPED_VRAM:
338    case SI_QUERY_MAPPED_GTT:
339    case SI_QUERY_SLAB_WASTED_VRAM:
340    case SI_QUERY_SLAB_WASTED_GTT:
341    case SI_QUERY_VRAM_USAGE:
342    case SI_QUERY_VRAM_VIS_USAGE:
343    case SI_QUERY_GTT_USAGE:
344    case SI_QUERY_GPU_TEMPERATURE:
345    case SI_QUERY_CURRENT_GPU_SCLK:
346    case SI_QUERY_CURRENT_GPU_MCLK:
347    case SI_QUERY_BUFFER_WAIT_TIME:
348    case SI_QUERY_GFX_IB_SIZE:
349    case SI_QUERY_NUM_MAPPED_BUFFERS:
350    case SI_QUERY_NUM_GFX_IBS:
351    case SI_QUERY_NUM_BYTES_MOVED:
352    case SI_QUERY_NUM_EVICTIONS:
353    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
354       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
355       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
356       break;
357    }
358    case SI_QUERY_GFX_BO_LIST_SIZE:
359       ws_id = winsys_id_from_type(query->b.type);
360       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
361       query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
362       break;
363    case SI_QUERY_CS_THREAD_BUSY:
364       ws_id = winsys_id_from_type(query->b.type);
365       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
366       query->end_time = os_time_get_nano();
367       break;
368    case SI_QUERY_GALLIUM_THREAD_BUSY:
369       query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
370       query->end_time = os_time_get_nano();
371       break;
372    case SI_QUERY_GPU_LOAD:
373    case SI_QUERY_GPU_SHADERS_BUSY:
374    case SI_QUERY_GPU_TA_BUSY:
375    case SI_QUERY_GPU_GDS_BUSY:
376    case SI_QUERY_GPU_VGT_BUSY:
377    case SI_QUERY_GPU_IA_BUSY:
378    case SI_QUERY_GPU_SX_BUSY:
379    case SI_QUERY_GPU_WD_BUSY:
380    case SI_QUERY_GPU_BCI_BUSY:
381    case SI_QUERY_GPU_SC_BUSY:
382    case SI_QUERY_GPU_PA_BUSY:
383    case SI_QUERY_GPU_DB_BUSY:
384    case SI_QUERY_GPU_CP_BUSY:
385    case SI_QUERY_GPU_CB_BUSY:
386    case SI_QUERY_GPU_SDMA_BUSY:
387    case SI_QUERY_GPU_PFP_BUSY:
388    case SI_QUERY_GPU_MEQ_BUSY:
389    case SI_QUERY_GPU_ME_BUSY:
390    case SI_QUERY_GPU_SURF_SYNC_BUSY:
391    case SI_QUERY_GPU_CP_DMA_BUSY:
392    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
393       query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
394       query->begin_result = 0;
395       break;
396    case SI_QUERY_NUM_COMPILATIONS:
397       query->end_result = p_atomic_read(&sctx->screen->num_compilations);
398       break;
399    case SI_QUERY_NUM_SHADERS_CREATED:
400       query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
401       break;
402    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
403       query->end_result = sctx->last_tex_ps_draw_ratio;
404       break;
405    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
406       query->end_result = sctx->screen->live_shader_cache.hits;
407       break;
408    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
409       query->end_result = sctx->screen->live_shader_cache.misses;
410       break;
411    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
412       query->end_result = sctx->screen->num_memory_shader_cache_hits;
413       break;
414    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
415       query->end_result = sctx->screen->num_memory_shader_cache_misses;
416       break;
417    case SI_QUERY_DISK_SHADER_CACHE_HITS:
418       query->end_result = sctx->screen->num_disk_shader_cache_hits;
419       break;
420    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
421       query->end_result = sctx->screen->num_disk_shader_cache_misses;
422       break;
423    case SI_QUERY_GPIN_ASIC_ID:
424    case SI_QUERY_GPIN_NUM_SIMD:
425    case SI_QUERY_GPIN_NUM_RB:
426    case SI_QUERY_GPIN_NUM_SPI:
427    case SI_QUERY_GPIN_NUM_SE:
428       break;
429    default:
430       unreachable("si_query_sw_end: bad query type");
431    }
432 
433    return true;
434 }
435 
436 static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
437                                    union pipe_query_result *result)
438 {
439    struct si_query_sw *query = (struct si_query_sw *)squery;
440 
441    switch (query->b.type) {
442    case PIPE_QUERY_TIMESTAMP_DISJOINT:
443       /* Convert from cycles per millisecond to cycles per second (Hz). */
444       result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
445       result->timestamp_disjoint.disjoint = false;
446       return true;
447    case PIPE_QUERY_GPU_FINISHED: {
448       struct pipe_screen *screen = sctx->b.screen;
449       struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
450 
451       result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
452       return result->b;
453    }
454 
455    case SI_QUERY_GFX_BO_LIST_SIZE:
456       result->u64 =
457          (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
458       return true;
459    case SI_QUERY_CS_THREAD_BUSY:
460    case SI_QUERY_GALLIUM_THREAD_BUSY:
461       result->u64 =
462          (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
463       return true;
464    case SI_QUERY_GPIN_ASIC_ID:
465       result->u32 = 0;
466       return true;
467    case SI_QUERY_GPIN_NUM_SIMD:
468       result->u32 = sctx->screen->info.num_cu;
469       return true;
470    case SI_QUERY_GPIN_NUM_RB:
471       result->u32 = sctx->screen->info.max_render_backends;
472       return true;
473    case SI_QUERY_GPIN_NUM_SPI:
474       result->u32 = 1; /* all supported chips have one SPI per SE */
475       return true;
476    case SI_QUERY_GPIN_NUM_SE:
477       result->u32 = sctx->screen->info.max_se;
478       return true;
479    }
480 
481    result->u64 = query->end_result - query->begin_result;
482 
483    switch (query->b.type) {
484    case SI_QUERY_BUFFER_WAIT_TIME:
485    case SI_QUERY_GPU_TEMPERATURE:
486       result->u64 /= 1000;
487       break;
488    case SI_QUERY_CURRENT_GPU_SCLK:
489    case SI_QUERY_CURRENT_GPU_MCLK:
490       result->u64 *= 1000000;
491       break;
492    }
493 
494    return true;
495 }
496 
497 static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
498                                                  .begin = si_query_sw_begin,
499                                                  .end = si_query_sw_end,
500                                                  .get_result = si_query_sw_get_result,
501                                                  .get_result_resource = NULL};
502 
503 static struct pipe_query *si_query_sw_create(unsigned query_type)
504 {
505    struct si_query_sw *query;
506 
507    query = CALLOC_STRUCT(si_query_sw);
508    if (!query)
509       return NULL;
510 
511    query->b.type = query_type;
512    query->b.ops = &sw_query_ops;
513 
514    return (struct pipe_query *)query;
515 }
516 
517 void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
518 {
519    struct si_query_buffer *prev = buffer->previous;
520 
521    /* Release all query buffers. */
522    while (prev) {
523       struct si_query_buffer *qbuf = prev;
524       prev = prev->previous;
525       si_resource_reference(&qbuf->buf, NULL);
526       FREE(qbuf);
527    }
528 
529    si_resource_reference(&buffer->buf, NULL);
530 }
531 
532 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
533 {
534    /* Discard all query buffers except for the oldest. */
535    while (buffer->previous) {
536       struct si_query_buffer *qbuf = buffer->previous;
537       buffer->previous = qbuf->previous;
538 
539       si_resource_reference(&buffer->buf, NULL);
540       buffer->buf = qbuf->buf; /* move ownership */
541       FREE(qbuf);
542    }
543    buffer->results_end = 0;
544 
545    if (!buffer->buf)
546       return;
547 
548    /* Discard even the oldest buffer if it can't be mapped without a stall. */
549    if (si_cs_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
550        !sctx->ws->buffer_wait(sctx->ws, buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
551       si_resource_reference(&buffer->buf, NULL);
552    } else {
553       buffer->unprepared = true;
554    }
555 }
556 
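/* Grow-on-demand allocator for query result storage: when the current
 * buffer cannot hold another result of 'size' bytes, it is pushed onto the
 * buffer->previous chain and a fresh staging buffer is allocated;
 * prepare_buffer (if given) initializes new or reused buffers. Readback and
 * predication later walk the whole chain.
 */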
557 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
558                            bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
559                            unsigned size)
560 {
561    bool unprepared = buffer->unprepared;
562    buffer->unprepared = false;
563 
564    if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
565       if (buffer->buf) {
566          struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
567          memcpy(qbuf, buffer, sizeof(*qbuf));
568          buffer->previous = qbuf;
569       }
570       buffer->results_end = 0;
571 
572       /* Queries are normally read by the CPU after
573        * being written by the gpu, hence staging is probably a good
574        * usage pattern.
575        */
576       struct si_screen *screen = sctx->screen;
577       unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
578       buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
579       if (unlikely(!buffer->buf))
580          return false;
581       unprepared = true;
582    }
583 
584    if (unprepared && prepare_buffer) {
585       if (unlikely(!prepare_buffer(sctx, buffer))) {
586          si_resource_reference(&buffer->buf, NULL);
587          return false;
588       }
589    }
590 
591    return true;
592 }
593 
594 void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
595 {
596    struct si_query_hw *query = (struct si_query_hw *)squery;
597 
598    si_query_buffer_destroy(sctx->screen, &query->buffer);
599    si_resource_reference(&query->workaround_buf, NULL);
600    FREE(squery);
601 }
602 
603 static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
604 {
605    struct si_query_hw *query = container_of(qbuf, struct si_query_hw, buffer);
606    struct si_screen *screen = sctx->screen;
607 
608    /* The caller ensures that the buffer is currently unused by the GPU. */
609    uint32_t *results = screen->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL,
610                                               PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
611    if (!results)
612       return false;
613 
614    memset(results, 0, qbuf->buf->b.b.width0);
615 
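   /* For occlusion queries each result slice holds max_render_backends
    * pairs of 64-bit {begin, end} ZPASS counts (16 bytes per RB), followed
    * by the fence. Bit 31 of the high dword of each count is the
    * "result written" flag, so it is preset here for RBs that are disabled
    * and will never report a value, letting the result computation treat
    * those slots as complete (see si_query_read_result).
    */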
616    if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
617        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
618        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
619       unsigned max_rbs = screen->info.max_render_backends;
620       unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
621       unsigned num_results;
622       unsigned i, j;
623 
624       /* Set top bits for unused backends. */
625       num_results = qbuf->buf->b.b.width0 / query->result_size;
626       for (j = 0; j < num_results; j++) {
627          for (i = 0; i < max_rbs; i++) {
628             if (!(enabled_rb_mask & (1 << i))) {
629                results[(i * 4) + 1] = 0x80000000;
630                results[(i * 4) + 3] = 0x80000000;
631             }
632          }
633          results += 4 * max_rbs;
634       }
635    }
636 
637    return true;
638 }
639 
640 static unsigned si_query_pipestats_num_results(struct si_screen *sscreen)
641 {
642    return sscreen->info.gfx_level >= GFX11 ? 14 : 11;
643 }
644 
645 static unsigned si_query_pipestat_dw_offset(enum pipe_statistics_query_index index)
646 {
647    switch (index) {
648    case PIPE_STAT_QUERY_PS_INVOCATIONS: return 0;
649    case PIPE_STAT_QUERY_C_PRIMITIVES: return 2;
650    case PIPE_STAT_QUERY_C_INVOCATIONS: return 4;
651    case PIPE_STAT_QUERY_VS_INVOCATIONS: return 6;
652    case PIPE_STAT_QUERY_GS_INVOCATIONS: return 8;
653    case PIPE_STAT_QUERY_GS_PRIMITIVES: return 10;
654    case PIPE_STAT_QUERY_IA_PRIMITIVES: return 12;
655    case PIPE_STAT_QUERY_IA_VERTICES: return 14;
656    case PIPE_STAT_QUERY_HS_INVOCATIONS: return 16;
657    case PIPE_STAT_QUERY_DS_INVOCATIONS: return 18;
658    case PIPE_STAT_QUERY_CS_INVOCATIONS: return 20;
659    /* gfx11: MS_INVOCATIONS */
660    /* gfx11: MS_PRIMITIVES */
661    /* gfx11: TS_INVOCATIONS */
662    default:
663       assert(false);
664    }
665    return ~0;
666 }
667 
668 unsigned si_query_pipestat_end_dw_offset(struct si_screen *sscreen,
669                                          enum pipe_statistics_query_index index)
670 {
671    return si_query_pipestats_num_results(sscreen) * 2 + si_query_pipestat_dw_offset(index);
672 }
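/* Worked example (pre-GFX11, 11 counters): the "begin" sample occupies
 * dwords [0..21] and the "end" sample dwords [22..43], so for
 * PIPE_STAT_QUERY_PS_INVOCATIONS (dword offset 0) the begin value sits at
 * byte 0 and the end value at byte 22 * 4 = 88, with the fence dword placed
 * after both samples (see si_get_hw_query_params).
 */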
673 
674 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
675                                             enum pipe_query_flags flags,
676                                             enum pipe_query_value_type result_type,
677                                             int index, struct pipe_resource *resource,
678                                             unsigned offset);
679 
680 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
681                                       struct si_resource *buffer, uint64_t va);
682 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
683                                      struct si_resource *buffer, uint64_t va);
684 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *, void *buffer,
685                                    union pipe_query_result *result);
686 static void si_query_hw_clear_result(struct si_query_hw *, union pipe_query_result *);
687 
688 static struct si_query_hw_ops query_hw_default_hw_ops = {
689    .prepare_buffer = si_query_hw_prepare_buffer,
690    .emit_start = si_query_hw_do_emit_start,
691    .emit_stop = si_query_hw_do_emit_stop,
692    .clear_result = si_query_hw_clear_result,
693    .add_result = si_query_hw_add_result,
694 };
695 
696 static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
697                                              unsigned index)
698 {
699    struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
700    if (!query)
701       return NULL;
702 
703    query->b.type = query_type;
704    query->b.ops = &query_hw_ops;
705    query->ops = &query_hw_default_hw_ops;
706 
707    switch (query_type) {
708    case PIPE_QUERY_OCCLUSION_COUNTER:
709    case PIPE_QUERY_OCCLUSION_PREDICATE:
710    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
711       query->result_size = 16 * sscreen->info.max_render_backends;
712       query->result_size += 16; /* for the fence + alignment */
713       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
714       break;
715    case PIPE_QUERY_TIME_ELAPSED:
716       query->result_size = 24;
717       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
718       break;
719    case PIPE_QUERY_TIMESTAMP:
720       query->result_size = 16;
721       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
722       query->flags = SI_QUERY_HW_FLAG_NO_START;
723       break;
724    case PIPE_QUERY_PRIMITIVES_EMITTED:
725    case PIPE_QUERY_PRIMITIVES_GENERATED:
726    case PIPE_QUERY_SO_STATISTICS:
727    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
728       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
729       query->result_size = 32;
730       query->b.num_cs_dw_suspend = 6;
731       query->stream = index;
732       break;
733    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
734       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
735       query->result_size = 32 * SI_MAX_STREAMS;
736       query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
737       break;
738    case PIPE_QUERY_PIPELINE_STATISTICS:
739       query->result_size = si_query_pipestats_num_results(sscreen) * 16;
740       query->result_size += 8; /* for the fence + alignment */
741       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
742       query->index = index;
743       if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) &&
744           sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3))
745          query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
746       break;
747    default:
748       assert(0);
749       FREE(query);
750       return NULL;
751    }
752 
753    return (struct pipe_query *)query;
754 }
755 
756 static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
757 {
758    if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
759        type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
760       bool old_enable = sctx->num_occlusion_queries != 0;
761       bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
762       bool enable, perfect_enable;
763 
764       sctx->num_occlusion_queries += diff;
765       assert(sctx->num_occlusion_queries >= 0);
766 
767       if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
768          sctx->num_perfect_occlusion_queries += diff;
769          assert(sctx->num_perfect_occlusion_queries >= 0);
770       }
771 
772       enable = sctx->num_occlusion_queries != 0;
773       perfect_enable = sctx->num_perfect_occlusion_queries != 0;
774 
775       if (enable != old_enable || perfect_enable != old_perfect_enable) {
776          si_set_occlusion_query_state(sctx, old_perfect_enable);
777       }
778    }
779 }
780 
781 static unsigned event_type_for_stream(unsigned stream)
782 {
783    switch (stream) {
784    default:
785    case 0:
786       return V_028A90_SAMPLE_STREAMOUTSTATS;
787    case 1:
788       return V_028A90_SAMPLE_STREAMOUTSTATS1;
789    case 2:
790       return V_028A90_SAMPLE_STREAMOUTSTATS2;
791    case 3:
792       return V_028A90_SAMPLE_STREAMOUTSTATS3;
793    }
794 }
795 
796 static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
797 {
798    radeon_begin(cs);
799    radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
800    radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
801    radeon_emit(va);
802    radeon_emit(va >> 32);
803    radeon_end();
804 }
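/* Each SAMPLE_STREAMOUTSTATS* event makes the CP dump the streamout
 * statistics of one stream (NumPrimitivesWritten and PrimitiveStorageNeeded,
 * both 64-bit) to 'va'; the begin and end samples of a query are 16 bytes
 * apart within its 32-byte result slice.
 */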
805 
806 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
807                                       struct si_resource *buffer, uint64_t va)
808 {
809    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
810 
811    switch (query->b.type) {
812    case PIPE_QUERY_OCCLUSION_COUNTER:
813    case PIPE_QUERY_OCCLUSION_PREDICATE:
814    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
815       radeon_begin(cs);
816       if (sctx->gfx_level >= GFX11) {
817          uint64_t rb_mask = BITFIELD64_MASK(sctx->screen->info.max_render_backends);
818 
819          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
820          radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));
821          radeon_emit(PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) |
822                      PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
823                      PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
824          radeon_emit(PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
825       }
826 
827       radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
828       if (sctx->gfx_level >= GFX11)
829          radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
830       else
831          radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
832       radeon_emit(va);
833       radeon_emit(va >> 32);
834       radeon_end();
835       break;
836    }
837    case PIPE_QUERY_PRIMITIVES_EMITTED:
838    case PIPE_QUERY_PRIMITIVES_GENERATED:
839    case PIPE_QUERY_SO_STATISTICS:
840    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
841       emit_sample_streamout(cs, va, query->stream);
842       break;
843    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
844       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
845          emit_sample_streamout(cs, va + 32 * stream, stream);
846       break;
847    case PIPE_QUERY_TIME_ELAPSED:
848       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
849                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
850       break;
851    case PIPE_QUERY_PIPELINE_STATISTICS: {
852       if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
853          /* The hw GS primitive counter doesn't work when ngg is active.
854           * So if use_ngg is true, we don't use the hw version but instead
855           * emulate it in the GS shader.
856           * The value is written at the same position, so we don't need to
857           * change anything else.
858           * If ngg is enabled for the draw, the primitive count is written in
859           * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported
860           * vertices is stored in gs_emitted_vertices and the number of prim
861           * is computed based on the output prim type in emit_gs_epilogue.
862           */
863          struct pipe_shader_buffer sbuf;
864          sbuf.buffer = &buffer->b.b;
865          sbuf.buffer_offset = query->buffer.results_end;
866          sbuf.buffer_size = buffer->bo_size;
867          si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
868          SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 1);
869 
870          const uint32_t zero = 0;
871          radeon_begin(cs);
872          /* Clear the emulated counter end value. We don't clear start because it's unused. */
873          va += si_query_pipestat_end_dw_offset(sctx->screen, query->index) * 4;
874          radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
875          radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
876          radeon_emit(va);
877          radeon_emit(va >> 32);
878          radeon_emit(zero);
879          radeon_end();
880 
881          sctx->num_pipeline_stat_emulated_queries++;
882       } else {
883          radeon_begin(cs);
884          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
885          radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
886          radeon_emit(va);
887          radeon_emit(va >> 32);
888          radeon_end();
889       }
890       break;
891    }
892    default:
893       assert(0);
894    }
895    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
896                              RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
897 }
898 
899 static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
900 {
901    uint64_t va;
902 
903    if (!query->buffer.buf && query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
904       si_resource_reference(&query->buffer.buf, sctx->pipeline_stats_query_buf);
905 
906    /* Don't realloc pipeline_stats_query_buf */
907    if ((!(query->flags & SI_QUERY_EMULATE_GS_COUNTERS) || !sctx->pipeline_stats_query_buf) &&
908        !si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
909       return;
910 
911    if (query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
912       si_resource_reference(&sctx->pipeline_stats_query_buf, query->buffer.buf);
913 
914    si_update_occlusion_query_state(sctx, query->b.type, 1);
915    si_update_prims_generated_query_state(sctx, query->b.type, 1);
916 
917    if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
918       sctx->num_pipeline_stat_queries++;
919 
920    si_need_gfx_cs_space(sctx, 0);
921 
922    va = query->buffer.buf->gpu_address + query->buffer.results_end;
923    query->ops->emit_start(sctx, query, query->buffer.buf, va);
924 }
925 
926 static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
927                                      struct si_resource *buffer, uint64_t va)
928 {
929    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
930    uint64_t fence_va = 0;
931 
932    switch (query->b.type) {
933    case PIPE_QUERY_OCCLUSION_COUNTER:
934    case PIPE_QUERY_OCCLUSION_PREDICATE:
935    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
936       va += 8;
937       radeon_begin(cs);
938       radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
939       if (sctx->gfx_level >= GFX11)
940          radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
941       else
942          radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
943       radeon_emit(va);
944       radeon_emit(va >> 32);
945       radeon_end();
946 
947       fence_va = va + sctx->screen->info.max_render_backends * 16 - 8;
948       break;
949    }
950    case PIPE_QUERY_PRIMITIVES_EMITTED:
951    case PIPE_QUERY_PRIMITIVES_GENERATED:
952    case PIPE_QUERY_SO_STATISTICS:
953    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
954       va += 16;
955       emit_sample_streamout(cs, va, query->stream);
956       break;
957    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
958       va += 16;
959       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
960          emit_sample_streamout(cs, va + 32 * stream, stream);
961       break;
962    case PIPE_QUERY_TIME_ELAPSED:
963       va += 8;
964       FALLTHROUGH;
965    case PIPE_QUERY_TIMESTAMP:
966       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
967                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
968       fence_va = va + 8;
969       break;
970    case PIPE_QUERY_PIPELINE_STATISTICS: {
971       unsigned sample_size = (query->result_size - 8) / 2;
972 
973       va += sample_size;
974 
975       radeon_begin(cs);
976       if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
977          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
978          radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
979 
980          if (--sctx->num_pipeline_stat_emulated_queries == 0) {
981             si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
982             SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 0);
983          }
984       } else {
985          radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
986          radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
987          radeon_emit(va);
988          radeon_emit(va >> 32);
989       }
990       radeon_end();
991 
992       fence_va = va + sample_size;
993       break;
994    }
995    default:
996       assert(0);
997    }
998    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, query->buffer.buf,
999                              RADEON_USAGE_WRITE | RADEON_PRIO_QUERY);
1000 
1001    if (fence_va) {
1002       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
1003                         EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
1004                         query->b.type);
1005    }
1006 }
1007 
1008 static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
1009 {
1010    uint64_t va;
1011 
1012    /* The queries which need begin already called this in begin_query. */
1013    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1014       si_need_gfx_cs_space(sctx, 0);
1015       if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
1016                                  query->result_size))
1017          return;
1018    }
1019 
1020    if (!query->buffer.buf)
1021       return; // previous buffer allocation failure
1022 
1023    /* emit end query */
1024    va = query->buffer.buf->gpu_address + query->buffer.results_end;
1025 
1026    query->ops->emit_stop(sctx, query, query->buffer.buf, va);
1027 
1028    query->buffer.results_end += query->result_size;
1029 
1030    si_update_occlusion_query_state(sctx, query->b.type, -1);
1031    si_update_prims_generated_query_state(sctx, query->b.type, -1);
1032 
1033    if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
1034       sctx->num_pipeline_stat_queries--;
1035 }
1036 
1037 static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
1038                                uint32_t op)
1039 {
1040    struct radeon_cmdbuf *cs = &ctx->gfx_cs;
1041 
1042    radeon_begin(cs);
1043 
1044    if (ctx->gfx_level >= GFX9) {
1045       radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
1046       radeon_emit(op);
1047       radeon_emit(va);
1048       radeon_emit(va >> 32);
1049    } else {
1050       radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
1051       radeon_emit(va);
1052       radeon_emit(op | ((va >> 32) & 0xFF));
1053    }
1054    radeon_end();
1055 
1056    radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ | RADEON_PRIO_QUERY);
1057 }
1058 
1059 static void si_emit_query_predication(struct si_context *ctx)
1060 {
1061    uint32_t op;
1062    bool flag_wait, invert;
1063 
1064    struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
1065    if (!query)
1066       return;
1067 
1068    invert = ctx->render_cond_invert;
1069    flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
1070                ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
1071 
1072    if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1073                                           query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
1074       struct gfx10_sh_query *gfx10_query = (struct gfx10_sh_query *)query;
1075       struct gfx10_sh_query_buffer *qbuf, *first, *last;
1076 
1077       op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1078 
1079       /* if true then invert, see GL_ARB_conditional_render_inverted */
1080       if (!invert)
1081          op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1082       else
1083          op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1084 
1085       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1086 
1087       first = gfx10_query->first;
1088       last = gfx10_query->last;
1089 
1090       while (first) {
1091          qbuf = first;
1092          if (first != last)
1093             first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
1094          else
1095             first = NULL;
1096 
1097          unsigned results_base = gfx10_query->first_begin;
1098          uint64_t va_base = qbuf->buf->gpu_address;
1099          uint64_t va = va_base + results_base;
1100 
1101          unsigned begin = qbuf == gfx10_query->first ? gfx10_query->first_begin : 0;
1102          unsigned end = qbuf == gfx10_query->last ? gfx10_query->last_end : qbuf->buf->b.b.width0;
1103 
1104          unsigned count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
1105          do {
1106             if (gfx10_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1107                for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1108                   emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * stream, op);
1109 
1110                   /* set CONTINUE bit for all packets except the first */
1111                   op |= PREDICATION_CONTINUE;
1112                }
1113             } else {
1114                emit_set_predicate(ctx, qbuf->buf, va + 4 * sizeof(uint64_t) * gfx10_query->stream, op);
1115                op |= PREDICATION_CONTINUE;
1116             }
1117 
1118             results_base += sizeof(struct gfx10_sh_query_buffer_mem);
1119          } while (count--);
1120       }
1121    } else {
1122       struct si_query_buffer *qbuf;
1123 
1124       if (query->workaround_buf) {
1125          op = PRED_OP(PREDICATION_OP_BOOL64);
1126       } else {
1127          switch (query->b.type) {
1128          case PIPE_QUERY_OCCLUSION_COUNTER:
1129          case PIPE_QUERY_OCCLUSION_PREDICATE:
1130          case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1131             op = PRED_OP(PREDICATION_OP_ZPASS);
1132             break;
1133          case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1134          case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1135             op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
1136             invert = !invert;
1137             break;
1138          default:
1139             assert(0);
1140             return;
1141          }
1142       }
1143 
1144       /* if true then invert, see GL_ARB_conditional_render_inverted */
1145       if (invert)
1146          op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
1147       else
1148          op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
1149 
1150       /* Use the value written by compute shader as a workaround. Note that
1151        * the wait flag does not apply in this predication mode.
1152        *
1153        * The shader outputs the result value to L2. Workarounds only affect GFX8
1154        * and later, where the CP reads data from L2, so we don't need an
1155        * additional flush.
1156        */
1157       if (query->workaround_buf) {
1158          uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
1159          emit_set_predicate(ctx, query->workaround_buf, va, op);
1160          return;
1161       }
1162 
1163       op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1164 
1165       /* emit predicate packets for all data blocks */
1166       for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1167          unsigned results_base = 0;
1168          uint64_t va_base = qbuf->buf->gpu_address;
1169 
1170          while (results_base < qbuf->results_end) {
1171             uint64_t va = va_base + results_base;
1172 
1173             if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1174                for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1175                   emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1176 
1177                   /* set CONTINUE bit for all packets except the first */
1178                   op |= PREDICATION_CONTINUE;
1179                }
1180             } else {
1181                emit_set_predicate(ctx, qbuf->buf, va, op);
1182                op |= PREDICATION_CONTINUE;
1183             }
1184 
1185             results_base += query->result_size;
1186          }
1187       }
1188    }
1189 }
1190 
1191 static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
1192                                           unsigned index)
1193 {
1194    struct si_screen *sscreen = (struct si_screen *)ctx->screen;
1195 
1196    if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
1197        (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
1198       return si_query_sw_create(query_type);
1199 
1200    if (sscreen->use_ngg_streamout &&
1201        (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
1202         query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
1203         query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1204         query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
1205       return gfx10_sh_query_create(sscreen, query_type, index);
1206 
1207    return si_query_hw_create(sscreen, query_type, index);
1208 }
1209 
1210 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1211 {
1212    struct si_context *sctx = (struct si_context *)ctx;
1213    struct si_query *squery = (struct si_query *)query;
1214 
1215    squery->ops->destroy(sctx, squery);
1216 }
1217 
1218 static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
1219 {
1220    struct si_context *sctx = (struct si_context *)ctx;
1221    struct si_query *squery = (struct si_query *)query;
1222 
1223    return squery->ops->begin(sctx, squery);
1224 }
1225 
1226 bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
1227 {
1228    struct si_query_hw *query = (struct si_query_hw *)squery;
1229 
1230    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1231       assert(0);
1232       return false;
1233    }
1234 
1235    if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
1236       si_query_buffer_reset(sctx, &query->buffer);
1237 
1238    si_resource_reference(&query->workaround_buf, NULL);
1239 
1240    si_query_hw_emit_start(sctx, query);
1241    if (!query->buffer.buf)
1242       return false;
1243 
1244    list_addtail(&query->b.active_list, &sctx->active_queries);
1245    sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1246    return true;
1247 }
1248 
1249 static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
1250 {
1251    struct si_context *sctx = (struct si_context *)ctx;
1252    struct si_query *squery = (struct si_query *)query;
1253 
1254    return squery->ops->end(sctx, squery);
1255 }
1256 
1257 bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
1258 {
1259    struct si_query_hw *query = (struct si_query_hw *)squery;
1260 
1261    if (query->flags & SI_QUERY_HW_FLAG_NO_START)
1262       si_query_buffer_reset(sctx, &query->buffer);
1263 
1264    si_query_hw_emit_stop(sctx, query);
1265 
1266    if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
1267       list_delinit(&query->b.active_list);
1268       sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
1269    }
1270 
1271    if (!query->buffer.buf)
1272       return false;
1273 
1274    return true;
1275 }
1276 
1277 static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
1278                                    struct si_hw_query_params *params)
1279 {
1280    unsigned max_rbs = sctx->screen->info.max_render_backends;
1281 
1282    params->pair_stride = 0;
1283    params->pair_count = 1;
1284 
1285    switch (squery->b.type) {
1286    case PIPE_QUERY_OCCLUSION_COUNTER:
1287    case PIPE_QUERY_OCCLUSION_PREDICATE:
1288    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1289       params->start_offset = 0;
1290       params->end_offset = 8;
1291       params->fence_offset = max_rbs * 16;
1292       params->pair_stride = 16;
1293       params->pair_count = max_rbs;
1294       break;
1295    case PIPE_QUERY_TIME_ELAPSED:
1296       params->start_offset = 0;
1297       params->end_offset = 8;
1298       params->fence_offset = 16;
1299       break;
1300    case PIPE_QUERY_TIMESTAMP:
1301       params->start_offset = 0;
1302       params->end_offset = 0;
1303       params->fence_offset = 8;
1304       break;
1305    case PIPE_QUERY_PRIMITIVES_EMITTED:
1306       params->start_offset = 8;
1307       params->end_offset = 24;
1308       params->fence_offset = params->end_offset + 4;
1309       break;
1310    case PIPE_QUERY_PRIMITIVES_GENERATED:
1311       params->start_offset = 0;
1312       params->end_offset = 16;
1313       params->fence_offset = params->end_offset + 4;
1314       break;
1315    case PIPE_QUERY_SO_STATISTICS:
1316       params->start_offset = 8 - index * 8;
1317       params->end_offset = 24 - index * 8;
1318       params->fence_offset = params->end_offset + 4;
1319       break;
1320    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1321       params->pair_count = SI_MAX_STREAMS;
1322       params->pair_stride = 32;
1323       FALLTHROUGH;
1324    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1325       params->start_offset = 0;
1326       params->end_offset = 16;
1327 
1328       /* We can re-use the high dword of the last 64-bit value as a
1329        * fence: it is initialized as 0, and the high bit is set by
1330        * the write of the streamout stats event.
1331        */
1332       params->fence_offset = squery->result_size - 4;
1333       break;
1334    case PIPE_QUERY_PIPELINE_STATISTICS: {
1335       params->start_offset = si_query_pipestat_dw_offset(index) * 4;
1336       params->end_offset = si_query_pipestat_end_dw_offset(sctx->screen, index) * 4;
1337       params->fence_offset = si_query_pipestats_num_results(sctx->screen) * 16;
1338       break;
1339    }
1340    default:
1341       unreachable("si_get_hw_query_params unsupported");
1342    }
1343 }
1344 
1345 static unsigned si_query_read_result(void *map, unsigned start_index, unsigned end_index,
1346                                      bool test_status_bit)
1347 {
1348    uint32_t *current_result = (uint32_t *)map;
1349    uint64_t start, end;
1350 
1351    start = (uint64_t)current_result[start_index] | (uint64_t)current_result[start_index + 1] << 32;
1352    end = (uint64_t)current_result[end_index] | (uint64_t)current_result[end_index + 1] << 32;
1353 
1354    if (!test_status_bit || ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1355       return end - start;
1356    }
1357    return 0;
1358 }
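/* Example: for one RB of an occlusion counter, si_query_read_result(buf, 0,
 * 2, true) reads the 64-bit begin value from dwords 0-1 and the end value
 * from dwords 2-3, and returns end - start only if bit 63 is set in both,
 * i.e. only if the hardware actually wrote both samples.
 */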
1359 
1360 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1361                                    void *buffer, union pipe_query_result *result)
1362 {
1363    unsigned max_rbs = sscreen->info.max_render_backends;
1364 
1365    switch (query->b.type) {
1366    case PIPE_QUERY_OCCLUSION_COUNTER: {
1367       for (unsigned i = 0; i < max_rbs; ++i) {
1368          unsigned results_base = i * 16;
1369          result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1370       }
1371       break;
1372    }
1373    case PIPE_QUERY_OCCLUSION_PREDICATE:
1374    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1375       for (unsigned i = 0; i < max_rbs; ++i) {
1376          unsigned results_base = i * 16;
1377          result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1378       }
1379       break;
1380    }
1381    case PIPE_QUERY_TIME_ELAPSED:
1382       result->u64 += si_query_read_result(buffer, 0, 2, false);
1383       break;
1384    case PIPE_QUERY_TIMESTAMP:
1385       result->u64 = *(uint64_t *)buffer;
1386       break;
1387    case PIPE_QUERY_PRIMITIVES_EMITTED:
1388       /* SAMPLE_STREAMOUTSTATS stores this structure:
1389        * {
1390        *    u64 NumPrimitivesWritten;
1391        *    u64 PrimitiveStorageNeeded;
1392        * }
1393        * We only need NumPrimitivesWritten here. */
1394       result->u64 += si_query_read_result(buffer, 2, 6, true);
1395       break;
1396    case PIPE_QUERY_PRIMITIVES_GENERATED:
1397       /* Here we read PrimitiveStorageNeeded. */
1398       result->u64 += si_query_read_result(buffer, 0, 4, true);
1399       break;
1400    case PIPE_QUERY_SO_STATISTICS:
1401       result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1402       result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1403       break;
1404    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1405       result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1406                                   si_query_read_result(buffer, 0, 4, true);
1407       break;
1408    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1409       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1410          result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1411                                      si_query_read_result(buffer, 0, 4, true);
1412          buffer = (char *)buffer + 32;
1413       }
1414       break;
1415    case PIPE_QUERY_PIPELINE_STATISTICS:
1416       for (int i = 0; i < 11; i++) {
1417          result->pipeline_statistics.counters[i] +=
1418             si_query_read_result(buffer, si_query_pipestat_dw_offset(i),
1419                                  si_query_pipestat_end_dw_offset(sscreen, i), false);
1420       }
1421 #if 0 /* for testing */
1422       printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1423              "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1424              "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1425              result->pipeline_statistics.ia_vertices,
1426              result->pipeline_statistics.ia_primitives,
1427              result->pipeline_statistics.vs_invocations,
1428              result->pipeline_statistics.hs_invocations,
1429              result->pipeline_statistics.ds_invocations,
1430              result->pipeline_statistics.gs_invocations,
1431              result->pipeline_statistics.gs_primitives,
1432              result->pipeline_statistics.c_invocations,
1433              result->pipeline_statistics.c_primitives,
1434              result->pipeline_statistics.ps_invocations,
1435              result->pipeline_statistics.cs_invocations);
1436 #endif
1437       break;
1438    default:
1439       assert(0);
1440    }
1441 }
1442 
1443 void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
1444 {
1445    si_query_hw_emit_stop(sctx, (struct si_query_hw *)query);
1446 }
1447 
1448 void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
1449 {
1450    si_query_hw_emit_start(sctx, (struct si_query_hw *)query);
1451 }
1452 
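/* Operations shared by all buffer-backed hardware queries. */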
1453 static const struct si_query_ops query_hw_ops = {
1454    .destroy = si_query_hw_destroy,
1455    .begin = si_query_hw_begin,
1456    .end = si_query_hw_end,
1457    .get_result = si_query_hw_get_result,
1458    .get_result_resource = si_query_hw_get_result_resource,
1459 
1460    .suspend = si_query_hw_suspend,
1461    .resume = si_query_hw_resume,
1462 };
1463 
1464 static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1465                                 union pipe_query_result *result)
1466 {
1467    struct si_context *sctx = (struct si_context *)ctx;
1468    struct si_query *squery = (struct si_query *)query;
1469 
1470    return squery->ops->get_result(sctx, squery, wait, result);
1471 }
1472 
1473 static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1474                                          enum pipe_query_flags flags, enum pipe_query_value_type result_type,
1475                                          int index, struct pipe_resource *resource, unsigned offset)
1476 {
1477    struct si_context *sctx = (struct si_context *)ctx;
1478    struct si_query *squery = (struct si_query *)query;
1479 
1480    squery->ops->get_result_resource(sctx, squery, flags, result_type, index, resource, offset);
1481 }
1482 
1483 static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1484 {
1485    util_query_clear_result(result, query->b.type);
1486 }
1487 
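/* CPU readback path: map every buffer in the query's chain and accumulate
 * all completed slots into *result, optionally blocking until the GPU has
 * written them.
 */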
1488 bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1489                             union pipe_query_result *result)
1490 {
1491    struct si_screen *sscreen = sctx->screen;
1492    struct si_query_hw *query = (struct si_query_hw *)squery;
1493    struct si_query_buffer *qbuf;
1494 
1495    query->ops->clear_result(query, result);
1496 
1497    for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1498       unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1499       unsigned results_base = 0;
1500       void *map;
1501 
1502       if (squery->b.flushed)
1503          map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage);
1504       else
1505          map = si_buffer_map(sctx, qbuf->buf, usage);
1506 
1507       if (!map)
1508          return false;
1509 
1510       while (results_base != qbuf->results_end) {
1511          query->ops->add_result(sscreen, query, map + results_base, result);
1512          results_base += query->result_size;
1513       }
1514    }
1515 
1516    /* Convert GPU clock ticks to nanoseconds (clock_crystal_freq is in kHz). */
1517    if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
1518        squery->type == PIPE_QUERY_TIMESTAMP) {
1519       result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1520    }
1521    return true;
1522 }
1523 
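/* GPU resolve path for pipe_context::get_query_result_resource: a compute
 * shader reduces the query buffer chain directly into the destination
 * resource.  ssbo[0] is the current query buffer, ssbo[1] a zero-initialized
 * scratch buffer that carries partial results between buffers in the chain,
 * and ssbo[2] the final destination (bound only for the last buffer
 * processed, i.e. the one without a previous buffer).
 */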
1524 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
1525                                             enum pipe_query_flags flags,
1526                                             enum pipe_query_value_type result_type,
1527                                             int index, struct pipe_resource *resource,
1528                                             unsigned offset)
1529 {
1530    struct si_query_hw *query = (struct si_query_hw *)squery;
1531    struct si_query_buffer *qbuf;
1532    struct si_query_buffer *qbuf_prev;
1533    struct pipe_resource *tmp_buffer = NULL;
1534    unsigned tmp_buffer_offset = 0;
1535    struct si_qbo_state saved_state = {};
1536    struct pipe_grid_info grid = {};
1537    struct pipe_constant_buffer constant_buffer = {};
1538    struct pipe_shader_buffer ssbo[3];
1539    struct si_hw_query_params params;
1540    struct {
1541       uint32_t end_offset;
1542       uint32_t result_stride;
1543       uint32_t result_count;
1544       uint32_t config;
1545       uint32_t fence_offset;
1546       uint32_t pair_stride;
1547       uint32_t pair_count;
1548    } consts;
1549 
1550    if (!sctx->query_result_shader) {
1551       sctx->query_result_shader = si_create_query_result_cs(sctx);
1552       if (!sctx->query_result_shader)
1553          return;
1554    }
1555 
1556    if (query->buffer.previous) {
1557       u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
1558       if (!tmp_buffer)
1559          return;
1560    }
1561 
1562    si_save_qbo_state(sctx, &saved_state);
1563 
1564    si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
1565    consts.end_offset = params.end_offset - params.start_offset;
1566    consts.fence_offset = params.fence_offset - params.start_offset;
1567    consts.result_stride = query->result_size;
1568    consts.pair_stride = params.pair_stride;
1569    consts.pair_count = params.pair_count;
1570 
1571    constant_buffer.buffer_size = sizeof(consts);
1572    constant_buffer.user_buffer = &consts;
1573 
1574    ssbo[1].buffer = tmp_buffer;
1575    ssbo[1].buffer_offset = tmp_buffer_offset;
1576    ssbo[1].buffer_size = 16;
1577 
1578    ssbo[2] = ssbo[1];
1579 
1580    grid.block[0] = 1;
1581    grid.block[1] = 1;
1582    grid.block[2] = 1;
1583    grid.grid[0] = 1;
1584    grid.grid[1] = 1;
1585    grid.grid[2] = 1;
1586 
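   /* consts.config is a bitfield consumed by the query result shader.
    * index < 0 means the caller asked for result availability rather than a
    * value (Gallium convention), predicate queries get boolean handling, and
    * the result_type switch below selects the output format.  Bits 0-1 are
    * rewritten for each buffer of the chain in the loop further down.
    */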
1587    consts.config = 0;
1588    if (index < 0)
1589       consts.config |= 4;
1590    if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1591        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1592       consts.config |= 8;
1593    else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1594             query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1595       consts.config |= 8 | 256;
1596    else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
1597       consts.config |= 32;
1598 
1599    switch (result_type) {
1600    case PIPE_QUERY_TYPE_U64:
1601    case PIPE_QUERY_TYPE_I64:
1602       consts.config |= 64;
1603       break;
1604    case PIPE_QUERY_TYPE_I32:
1605       consts.config |= 128;
1606       break;
1607    case PIPE_QUERY_TYPE_U32:
1608       break;
1609    }
1610 
1611    sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
1612 
1613    for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1614       if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1615          qbuf_prev = qbuf->previous;
1616          consts.result_count = qbuf->results_end / query->result_size;
1617          consts.config &= ~3;
1618          if (qbuf != &query->buffer)
1619             consts.config |= 1;
1620          if (qbuf->previous)
1621             consts.config |= 2;
1622       } else {
1623          /* Only read the last timestamp. */
1624          qbuf_prev = NULL;
1625          consts.result_count = 0;
1626          consts.config |= 16;
1627          params.start_offset += qbuf->results_end - query->result_size;
1628       }
1629 
1630       sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);
1631 
1632       ssbo[0].buffer = &qbuf->buf->b.b;
1633       ssbo[0].buffer_offset = params.start_offset;
1634       ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1635 
1636       if (!qbuf->previous) {
1637          ssbo[2].buffer = resource;
1638          ssbo[2].buffer_offset = offset;
1639          ssbo[2].buffer_size = resource->width0 - offset;
1640          /* assert size is correct, based on result_type ? */
1641 
1642          si_resource(resource)->TC_L2_dirty = true;
1643       }
1644 
1645       if ((flags & PIPE_QUERY_WAIT) && qbuf == &query->buffer) {
1646          uint64_t va;
1647 
1648          /* Wait for result availability. Wait only for readiness
1649           * of the last entry, since the fence writes should be
1650           * serialized in the CP.
1651           */
1652          va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1653          va += params.fence_offset;
1654 
1655          si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
1656       }
1657       si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader,
1658                                     SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
1659                                     3, ssbo, 0x4);
1660    }
1661 
1662    si_restore_qbo_state(sctx, &saved_state);
1663    pipe_resource_reference(&tmp_buffer, NULL);
1664 }
1665 
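/* pipe_context::render_condition.  On parts with the PFP firmware bug noted
 * below, the streamout overflow result is first reduced into a small
 * workaround buffer via get_query_result_resource, which the SET_PREDICATION
 * emission can then read as a single 64-bit value instead of walking the raw
 * query buffers.
 */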
1666 static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
1667                                 enum pipe_render_cond_flag mode)
1668 {
1669    struct si_context *sctx = (struct si_context *)ctx;
1670    struct si_query_hw *squery = (struct si_query_hw *)query;
1671    struct si_atom *atom = &sctx->atoms.s.render_cond;
1672 
1673    if (query) {
1674       bool needs_workaround = false;
1675 
1676       /* A firmware regression in GFX8 (and early GFX9 PFP firmware, per
1677        * the check below) causes successive SET_PREDICATION packets to give
1678        * the wrong answer for non-inverted stream overflow predication.
1679        */
1680       if (((sctx->gfx_level == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
1681            (sctx->gfx_level == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
1682           !condition &&
1683           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1684            (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1685             (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
1686          needs_workaround = true;
1687       }
1688 
1689       if (needs_workaround && !squery->workaround_buf) {
1690          bool old_render_cond_enabled = sctx->render_cond_enabled;
1691          sctx->render_cond_enabled = false;
1692 
1693          u_suballocator_alloc(&sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
1694                               (struct pipe_resource **)&squery->workaround_buf);
1695 
1696          /* Reset to NULL to avoid a redundant SET_PREDICATION
1697           * from launching the compute grid.
1698           */
1699          sctx->render_cond = NULL;
1700 
1701          ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
1702                                         &squery->workaround_buf->b.b, squery->workaround_offset);
1703 
1704          /* Setting this in the render cond atom is too late,
1705           * so set it here. */
1706          sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
1707 
1708          sctx->render_cond_enabled = old_render_cond_enabled;
1709       }
1710    }
1711 
1712    sctx->render_cond = query;
1713    sctx->render_cond_invert = condition;
1714    sctx->render_cond_mode = mode;
1715    sctx->render_cond_enabled = query;
1716 
1717    si_set_atom_dirty(sctx, atom, query != NULL);
1718 }
1719 
1720 void si_suspend_queries(struct si_context *sctx)
1721 {
1722    struct si_query *query;
1723 
1724    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1725       query->ops->suspend(sctx, query);
1726 }
1727 
1728 void si_resume_queries(struct si_context *sctx)
1729 {
1730    struct si_query *query;
1731 
1732    /* Check CS space here. Resuming must not be interrupted by flushes. */
1733    si_need_gfx_cs_space(sctx, 0);
1734 
1735    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1736       query->ops->resume(sctx, query);
1737 }
1738 
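/* Descriptions of the software driver queries exposed through
 * pipe_screen::get_driver_query_info.  X() entries belong to no group
 * (group_id = ~0), XG() entries belong to an SI_QUERY_GROUP_* group.
 */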
1739 #define XFULL(name_, query_type_, type_, result_type_, group_id_)                                  \
1740    {                                                                                               \
1741       .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1742       .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_           \
1743    }
1744 
1745 #define X(name_, query_type_, type_, result_type_)                                                 \
1746    XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1747 
1748 #define XG(group_, name_, query_type_, type_, result_type_)                                        \
1749    XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1750 
1751 static struct pipe_driver_query_info si_driver_query_list[] = {
1752    X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1753    X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1754    X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1755    X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1756    X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
1757    X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1758    X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1759    X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1760    X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1761    X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1762    X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1763    X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1764    X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1765    X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1766    X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1767    X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1768    X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1769    X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1770    X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1771    X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1772    X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1773    X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1774    X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1775    X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1776    X("slab-wasted-VRAM", SLAB_WASTED_VRAM, BYTES, AVERAGE),
1777    X("slab-wasted-GTT", SLAB_WASTED_GTT, BYTES, AVERAGE),
1778    X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1779    X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1780    X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1781    X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1782    X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1783    X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1784    X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1785    X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1786    X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1787    X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1788    X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1789    X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1790    X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1791    X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1792    X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1793    X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1794    X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1795    X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1796 
1797    /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1798     * which use them as a fallback path to detect the GPU type.
1799     *
1800     * Note: The names of these queries are significant for GPUPerfStudio
1801     * (and possibly their order as well). */
1802    XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1803    XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1804    XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1805    XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1806    XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1807 
1808    X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1809    X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1810    X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1811 
1812    /* The following queries must be at the end of the list because their
1813     * availability is adjusted dynamically based on the DRM version. */
1814    X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1815    X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1816    X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1817    X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1818    X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1819    X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1820    X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1821    X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1822    X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1823    X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1824    X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1825    X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1826    X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1827    X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1828 
1829    /* SRBM_STATUS2 */
1830    X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1831 
1832    /* CP_STAT */
1833    X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1834    X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1835    X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1836    X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1837    X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1838    X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1839 };
1840 
1841 #undef X
1842 #undef XG
1843 #undef XFULL
1844 
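/* Trim the tail of si_driver_query_list based on what the kernel can report:
 * the last 6 entries need CP_STAT, the 7th-from-last (GPU-sdma-busy) needs
 * SRBM_STATUS2, and the last 21 cover all GRBM/SRBM/CP_STAT "busy" counters,
 * which on the radeon kernel driver require register-read query support.
 */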
1845 static unsigned si_get_num_queries(struct si_screen *sscreen)
1846 {
1847    /* amdgpu */
1848    if (sscreen->info.is_amdgpu) {
1849       if (sscreen->info.gfx_level >= GFX8)
1850          return ARRAY_SIZE(si_driver_query_list);
1851       else
1852          return ARRAY_SIZE(si_driver_query_list) - 7;
1853    }
1854 
1855    /* radeon */
1856    if (!sscreen->info.has_read_registers_query)
1857       return ARRAY_SIZE(si_driver_query_list) - 21;
1858 
1859    if (sscreen->info.gfx_level == GFX7)
1860       return ARRAY_SIZE(si_driver_query_list) - 6;
1861    return ARRAY_SIZE(si_driver_query_list) - 7;
1862 }
1863 
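/* pipe_screen::get_driver_query_info: a NULL info pointer requests the total
 * number of queries (driver queries plus perfcounters); max_value gives
 * frontends such as the Gallium HUD an upper bound for scaling.
 */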
1864 static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1865                                     struct pipe_driver_query_info *info)
1866 {
1867    struct si_screen *sscreen = (struct si_screen *)screen;
1868    unsigned num_queries = si_get_num_queries(sscreen);
1869 
1870    if (!info) {
1871       unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1872 
1873       return num_queries + num_perfcounters;
1874    }
1875 
1876    if (index >= num_queries)
1877       return si_get_perfcounter_info(sscreen, index - num_queries, info);
1878 
1879    *info = si_driver_query_list[index];
1880 
1881    switch (info->query_type) {
1882    case SI_QUERY_REQUESTED_VRAM:
1883    case SI_QUERY_VRAM_USAGE:
1884    case SI_QUERY_MAPPED_VRAM:
1885    case SI_QUERY_SLAB_WASTED_VRAM:
1886       info->max_value.u64 = (uint64_t)sscreen->info.vram_size_kb * 1024;
1887       break;
1888    case SI_QUERY_REQUESTED_GTT:
1889    case SI_QUERY_GTT_USAGE:
1890    case SI_QUERY_MAPPED_GTT:
1891    case SI_QUERY_SLAB_WASTED_GTT:
1892       info->max_value.u64 = (uint64_t)sscreen->info.gart_size_kb * 1024;
1893       break;
1894    case SI_QUERY_GPU_TEMPERATURE:
1895       info->max_value.u64 = 125;
1896       break;
1897    case SI_QUERY_VRAM_VIS_USAGE:
1898       info->max_value.u64 = (uint64_t)sscreen->info.vram_vis_size_kb * 1024;
1899       break;
1900    }
1901 
1902    if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1903       info->group_id += sscreen->perfcounters->base.num_groups;
1904 
1905    return 1;
1906 }
1907 
1908 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1909  * performance counter groups, so be careful when changing this and related
1910  * functions.
1911  */
1912 static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1913                                           struct pipe_driver_query_group_info *info)
1914 {
1915    struct si_screen *sscreen = (struct si_screen *)screen;
1916    unsigned num_pc_groups = 0;
1917 
1918    if (sscreen->perfcounters)
1919       num_pc_groups = sscreen->perfcounters->base.num_groups;
1920 
1921    if (!info)
1922       return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1923 
1924    if (index < num_pc_groups)
1925       return si_get_perfcounter_group_info(sscreen, index, info);
1926 
1927    index -= num_pc_groups;
1928    if (index >= SI_NUM_SW_QUERY_GROUPS)
1929       return 0;
1930 
1931    info->name = "GPIN";
1932    info->max_active_queries = 5;
1933    info->num_queries = 5;
1934    return 1;
1935 }
1936 
1937 void si_init_query_functions(struct si_context *sctx)
1938 {
1939    sctx->b.create_query = si_create_query;
1940    sctx->b.create_batch_query = si_create_batch_query;
1941    sctx->b.destroy_query = si_destroy_query;
1942    sctx->b.begin_query = si_begin_query;
1943    sctx->b.end_query = si_end_query;
1944    sctx->b.get_query_result = si_get_query_result;
1945    sctx->b.get_query_result_resource = si_get_query_result_resource;
1946 
1947    if (sctx->has_graphics) {
1948       sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1949       sctx->b.render_condition = si_render_condition;
1950    }
1951 
1952    list_inithead(&sctx->active_queries);
1953 }
1954 
1955 void si_init_screen_query_functions(struct si_screen *sscreen)
1956 {
1957    sscreen->b.get_driver_query_info = si_get_driver_query_info;
1958    sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1959 }
1960