1 /*
2  * Copyright 2016 Red Hat Inc.
3  * Based on anv:
4  * Copyright © 2015 Intel Corporation
5  *
6  * SPDX-License-Identifier: MIT
7  */
8 
9 #include <assert.h>
10 #include <fcntl.h>
11 #include <stdbool.h>
12 #include <string.h>
13 
14 #include "bvh/bvh.h"
15 #include "meta/radv_meta.h"
16 #include "nir/nir_builder.h"
17 #include "util/u_atomic.h"
18 #include "vulkan/vulkan_core.h"
19 #include "radv_cs.h"
20 #include "radv_entrypoints.h"
21 #include "radv_perfcounter.h"
22 #include "radv_query.h"
23 #include "radv_rmv.h"
24 #include "sid.h"
25 #include "vk_acceleration_structure.h"
26 #include "vk_common_entrypoints.h"
27 #include "vk_shader_module.h"
28 
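/* Sentinel stored in a timestamp query slot until the GPU writes the real timestamp; readers treat such a slot as not yet available. */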
29 #define TIMESTAMP_NOT_READY UINT64_MAX
30 
31 static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkQueryType query_type,
32                               struct radeon_winsys_bo *src_bo, struct radeon_winsys_bo *dst_bo, uint64_t src_offset,
33                               uint64_t dst_offset, uint32_t src_stride, uint32_t dst_stride, size_t dst_size,
34                               uint32_t count, uint32_t flags, uint32_t pipeline_stats_mask, uint32_t avail_offset,
35                               bool uses_emulated_queries);
36 
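/* Emit a COPY_DATA packet that copies a query counter from the selected source (GDS or memory) to the destination address, with write confirmation. */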
37 static void
38 gfx10_copy_shader_query(struct radeon_cmdbuf *cs, uint32_t src_sel, uint64_t src_va, uint64_t dst_va)
39 {
40    radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
41    radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
42    radeon_emit(cs, src_va);
43    radeon_emit(cs, src_va >> 32);
44    radeon_emit(cs, dst_va);
45    radeon_emit(cs, dst_va >> 32);
46 }
47 
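/* Copy a shader query counter on the GFX queue: wait for the producing shader stage to go idle and invalidate L2 before emitting the copy. */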
48 static void
49 gfx10_copy_shader_query_gfx(struct radv_cmd_buffer *cmd_buffer, bool use_gds, uint32_t src_offset, uint64_t dst_va)
50 {
51    uint32_t src_sel;
52    uint64_t src_va;
53 
54    if (use_gds) {
55       /* Make sure GDS is idle before copying the value. */
56       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
57 
58       src_sel = COPY_DATA_GDS;
59       src_va = src_offset;
60    } else {
61       /* Make sure GE is idle before copying the value. */
62       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
63 
64       src_sel = COPY_DATA_SRC_MEM;
65       src_va = cmd_buffer->state.shader_query_buf_va + src_offset - RADV_SHADER_QUERY_PRIM_GEN_OFFSET(0);
66    }
67 
68    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2;
69    radv_emit_cache_flush(cmd_buffer);
70 
71    gfx10_copy_shader_query(cmd_buffer->cs, src_sel, src_va, dst_va);
72 }
73 
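/* Same as above, but on the gang (ACE) command stream: flush compute work and L2 before copying the GDS counter. */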
74 static void
75 gfx10_copy_shader_query_ace(struct radv_cmd_buffer *cmd_buffer, uint32_t src_offset, uint64_t src_va)
76 {
77    /* Make sure GDS is idle before copying the value. */
78    cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
79    radv_gang_cache_flush(cmd_buffer);
80 
81    gfx10_copy_shader_query(cmd_buffer->gang.cs, COPY_DATA_GDS, src_offset, src_va);
82 }
83 
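/* When VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, store the availability value at the given offset, zero-extended to 64 bits if VK_QUERY_RESULT_64_BIT is also set. */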
84 static void
85 radv_store_availability(nir_builder *b, nir_def *flags, nir_def *dst_buf, nir_def *offset, nir_def *value32)
86 {
87    nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
88 
89    nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
90 
91    nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);
92 
93    nir_push_else(b, NULL);
94 
95    nir_store_ssbo(b, value32, dst_buf, offset);
96 
97    nir_pop_if(b, NULL);
98 
99    nir_pop_if(b, NULL);
100 }
101 
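/* Number of bytes one query result occupies in the destination buffer, including the optional availability value. */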
102 static size_t
103 radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags)
104 {
105    unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0;
106    switch (pool->vk.query_type) {
107    case VK_QUERY_TYPE_TIMESTAMP:
108    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
109    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
110    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
111    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
112    case VK_QUERY_TYPE_OCCLUSION:
113    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
114       values += 1;
115       break;
116    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
117       values += util_bitcount(pool->vk.pipeline_statistics);
118       break;
119    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
120       values += 2;
121       break;
122    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
123       values += 1;
124       break;
125    case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
126       values += 1;
127       break;
128    default:
129       unreachable("trying to get size of unhandled query type");
130    }
131    return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4);
132 }
133 
134 /**
135  * Occlusion query
136  */
137 static bool
138 radv_occlusion_query_use_l2(const struct radv_physical_device *pdev)
139 {
140    /* Occlusion query writes don't go through L2 on GFX6-8 which means the driver would need to
141     * flush caches before every read in shaders or use MTYPE=3 (i.e. uncached) in the buffer
142     * descriptor to bypass L2. Use the WAIT_REG_MEM logic instead which is easier to implement.
143     */
144    return pdev->info.gfx_level >= GFX9;
145 }
146 
147 static nir_shader *
148 build_occlusion_query_shader(struct radv_device *device)
149 {
150    /* the shader this builds is roughly
151     *
152     * push constants {
153     * 	uint32_t flags;
154     * 	uint32_t dst_stride;
155     * };
156     *
157     * uint32_t src_stride = 16 * db_count;
158     *
159     * location(binding = 0) buffer dst_buf;
160     * location(binding = 1) buffer src_buf;
161     *
162     * void main() {
163     * 	uint64_t result = 0;
164     * 	uint64_t src_offset = src_stride * global_id.x;
165     * 	uint64_t dst_offset = dst_stride * global_id.x;
166     * 	bool available = true;
167     * 	for (int i = 0; i < db_count; ++i) {
168     *		if (enabled_rb_mask & BITFIELD64_BIT(i)) {
169     *			uint64_t start = src_buf[src_offset + 16 * i];
170     *			uint64_t end = src_buf[src_offset + 16 * i + 8];
171     *			if ((start & (1ull << 63)) && (end & (1ull << 63)))
172     *				result += end - start;
173     *			else
174     *				available = false;
175     *		}
176     * 	}
177     * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
178     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
179     * 		if (flags & VK_QUERY_RESULT_64_BIT)
180     * 			dst_buf[dst_offset] = result;
181     * 		else
182     * 			dst_buf[dst_offset] = (uint32_t)result.
183     * 	}
184     * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
185     * 		dst_buf[dst_offset + elem_size] = available;
186     * 	}
187     * }
188     */
189    const struct radv_physical_device *pdev = radv_device_physical(device);
190    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
191    b.shader->info.workgroup_size[0] = 64;
192 
193    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
194    nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
195    nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
196    nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
197    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
198    uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
199    unsigned db_count = pdev->info.max_render_backends;
200 
201    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
202 
203    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
204    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
205 
206    nir_def *global_id = get_global_ids(&b, 1);
207 
208    nir_def *input_stride = nir_imm_int(&b, db_count * 16);
209    nir_def *input_base = nir_imul(&b, input_stride, global_id);
210    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
211    nir_def *output_base = nir_imul(&b, output_stride, global_id);
212 
213    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
214    nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
215    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
216 
217    if (radv_occlusion_query_use_l2(pdev)) {
218       nir_def *query_result_wait = nir_test_mask(&b, flags, VK_QUERY_RESULT_WAIT_BIT);
219       nir_push_if(&b, query_result_wait);
220       {
221          /* Wait on the upper word of the last DB entry. */
222          nir_push_loop(&b);
223          {
224             const uint32_t rb_avail_offset = 16 * util_last_bit64(enabled_rb_mask) - 4;
225 
226             /* Prevent the SSBO load from being moved out of the loop. */
227             nir_scoped_memory_barrier(&b, SCOPE_INVOCATION, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo);
228 
229             nir_def *load_offset = nir_iadd_imm(&b, input_base, rb_avail_offset);
230             nir_def *load = nir_load_ssbo(&b, 1, 32, src_buf, load_offset, .align_mul = 4, .access = ACCESS_COHERENT);
231 
232             nir_push_if(&b, nir_ige_imm(&b, load, 0x80000000));
233             {
234                nir_jump(&b, nir_jump_break);
235             }
236             nir_pop_if(&b, NULL);
237          }
238          nir_pop_loop(&b, NULL);
239       }
240       nir_pop_if(&b, NULL);
241    }
242 
243    nir_push_loop(&b);
244 
245    nir_def *current_outer_count = nir_load_var(&b, outer_counter);
246    radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
247 
248    nir_def *enabled_cond = nir_iand_imm(&b, nir_ishl(&b, nir_imm_int64(&b, 1), current_outer_count), enabled_rb_mask);
249 
250    nir_push_if(&b, nir_i2b(&b, enabled_cond));
251 
252    nir_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
253    load_offset = nir_iadd(&b, input_base, load_offset);
254 
255    nir_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);
256 
257    nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
258    nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);
259 
260    nir_def *start_done = nir_ilt_imm(&b, nir_load_var(&b, start), 0);
261    nir_def *end_done = nir_ilt_imm(&b, nir_load_var(&b, end), 0);
262 
263    nir_push_if(&b, nir_iand(&b, start_done, end_done));
264 
265    nir_store_var(&b, result,
266                  nir_iadd(&b, nir_load_var(&b, result), nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
267                  0x1);
268 
269    nir_push_else(&b, NULL);
270 
271    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
272 
273    nir_pop_if(&b, NULL);
274    nir_pop_if(&b, NULL);
275    nir_pop_loop(&b, NULL);
276 
277    /* Store the result if complete or if partial results have been requested. */
278 
279    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
280    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
281    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
282 
283    nir_push_if(&b, result_is_64bit);
284 
285    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);
286 
287    nir_push_else(&b, NULL);
288 
289    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base, .align_mul = 8);
290 
291    nir_pop_if(&b, NULL);
292    nir_pop_if(&b, NULL);
293 
294    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
295                            nir_b2i32(&b, nir_load_var(&b, available)));
296 
297    return b.shader;
298 }
299 
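/* Begin an occlusion query: track the active/precise query state (which may require updating DB_COUNT_CONTROL) and emit the ZPASS event that writes the per-DB begin counters at 'va'. */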
300 static void
301 radv_begin_occlusion_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkQueryControlFlags flags)
302 {
303    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
304    const struct radv_physical_device *pdev = radv_device_physical(device);
305    struct radeon_cmdbuf *cs = cmd_buffer->cs;
306 
307    radeon_check_space(device->ws, cs, 11);
308 
309    ++cmd_buffer->state.active_occlusion_queries;
310    if (cmd_buffer->state.active_occlusion_queries == 1) {
311       if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
312          /* This is the first occlusion query, enable
313           * the hint if the precision bit is set.
314           */
315          cmd_buffer->state.perfect_occlusion_queries_enabled = true;
316       }
317 
318       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
319    } else {
320       if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) && !cmd_buffer->state.perfect_occlusion_queries_enabled) {
321          /* This is not the first query, but this one
322           * needs to enable precision, DB_COUNT_CONTROL
323           * has to be updated accordingly.
324           */
325          cmd_buffer->state.perfect_occlusion_queries_enabled = true;
326 
327          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
328       }
329    }
330 
331    if (pdev->info.gfx_level >= GFX11 && pdev->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
332       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
333    } else {
334       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
335       if (pdev->info.gfx_level >= GFX11) {
336          radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
337       } else {
338          radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
339       }
340    }
341    radeon_emit(cs, va);
342    radeon_emit(cs, va >> 32);
343 }
344 
345 static void
346 radv_end_occlusion_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
347 {
348    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
349    const struct radv_physical_device *pdev = radv_device_physical(device);
350    struct radeon_cmdbuf *cs = cmd_buffer->cs;
351 
352    radeon_check_space(device->ws, cs, 14);
353 
354    cmd_buffer->state.active_occlusion_queries--;
355    if (cmd_buffer->state.active_occlusion_queries == 0) {
356       /* Reset the perfect occlusion queries hint now that no
357        * queries are active.
358        */
359       cmd_buffer->state.perfect_occlusion_queries_enabled = false;
360 
361       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
362    }
363 
364    if (pdev->info.gfx_level >= GFX11 && pdev->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
365       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
366    } else {
367       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
368       if (pdev->info.gfx_level >= GFX11) {
369          radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
370       } else {
371          radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
372       }
373    }
374    radeon_emit(cs, va + 8);
375    radeon_emit(cs, (va + 8) >> 32);
376 }
377 
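/* Resolve occlusion query results with the compute shader above; on GFX6-8, WAIT_BIT is handled with WAIT_REG_MEM on the last DB's availability word instead of inside the shader. */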
378 static void
379 radv_copy_occlusion_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
380                                  uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset,
381                                  uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
382 {
383    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
384    const struct radv_physical_device *pdev = radv_device_physical(device);
385    struct radeon_cmdbuf *cs = cmd_buffer->cs;
386    uint64_t va = radv_buffer_get_va(pool->bo);
387 
388    if (!radv_occlusion_query_use_l2(pdev)) {
389       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
390          uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
391          uint32_t rb_avail_offset = 16 * util_last_bit64(enabled_rb_mask) - 4;
392          for (unsigned i = 0; i < query_count; ++i) {
393             unsigned query = first_query + i;
394             uint64_t src_va = va + query * pool->stride + rb_avail_offset;
395 
396             radeon_check_space(device->ws, cs, 7);
397 
398             /* Waits on the upper word of the last DB entry */
399             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va, 0x80000000, 0xffffffff);
400          }
401       }
402    }
403 
404    radv_query_shader(cmd_buffer, VK_QUERY_TYPE_OCCLUSION, pool->bo, dst_bo, first_query * pool->stride, dst_offset,
405                      pool->stride, stride, dst_size, query_count, flags, 0, 0, false);
406 }
407 
408 /**
409  * Pipeline stat query
410  */
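/* Maps each VkQueryPipelineStatisticFlagBits (by bit index) to its slot in the hardware pipeline-statistics sample layout. */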
411 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10, 13, 11, 12};
412 
413 static unsigned
414 radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query)
415 {
416    uint32_t idx = ffs(query) - 1;
417    return pipeline_statistics_indices[idx] * 8;
418 }
419 
420 static unsigned
421 radv_get_pipelinestat_query_size(struct radv_device *device)
422 {
423    /* GFX10_3 only has 11 valid pipeline statistics queries but in order to emulate mesh/task shader
424     * invocations, it's easier to use the same size as GFX11.
425     */
426    const struct radv_physical_device *pdev = radv_device_physical(device);
427    unsigned num_results = pdev->info.gfx_level >= GFX10_3 ? 14 : 11;
428    return num_results * 8;
429 }
430 
431 static nir_shader *
432 build_pipeline_statistics_query_shader(struct radv_device *device)
433 {
434    unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
435 
436    /* the shader this builds is roughly
437     *
438     * push constants {
439     * 	uint32_t flags;
440     * 	uint32_t dst_stride;
441     * 	uint32_t stats_mask;
442     * 	uint32_t avail_offset;
443     * };
444     *
445     * uint32_t src_stride = pipelinestat_block_size * 2;
446     *
447     * location(binding = 0) buffer dst_buf;
448     * location(binding = 1) buffer src_buf;
449     *
450     * void main() {
451     * 	uint64_t src_offset = src_stride * global_id.x;
452     * 	uint64_t dst_base = dst_stride * global_id.x;
453     * 	uint64_t dst_offset = dst_base;
454     * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
455     * 	uint32_t elem_count = stats_mask >> 16;
456     * 	uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
457     * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
458     * 		dst_buf[dst_offset + elem_count * elem_size] = available32;
459     * 	}
460     * 	if ((bool)available32) {
461     * 		// repeat 11 times:
462     * 		if (stats_mask & (1 << 0)) {
463     * 			uint64_t start = src_buf[src_offset + 8 * indices[0]];
464     * 			uint64_t end = src_buf[src_offset + 8 * indices[0] +
465     * pipelinestat_block_size]; uint64_t result = end - start; if (flags & VK_QUERY_RESULT_64_BIT)
466     * 				dst_buf[dst_offset] = result;
467     * 			else
468     * 				dst_buf[dst_offset] = (uint32_t)result.
469     * 			dst_offset += elem_size;
470     * 		}
471     * 	} else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
472     *              // Set everything to 0 as we don't know what is valid.
473     * 		for (int i = 0; i < elem_count; ++i)
474     * 			dst_buf[dst_base + elem_size * i] = 0;
475     * 	}
476     * }
477     */
478    const struct radv_physical_device *pdev = radv_device_physical(device);
479    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query");
480    b.shader->info.workgroup_size[0] = 64;
481 
482    nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
483    nir_variable *result = nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
484    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
485 
486    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
487    nir_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
488    nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
489    nir_def *uses_emulated_queries = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
490 
491    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
492    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
493 
494    nir_def *global_id = get_global_ids(&b, 1);
495 
496    nir_def *input_stride =
497       nir_bcsel(&b, nir_ine_imm(&b, uses_emulated_queries, 0), nir_imm_int(&b, pipelinestat_block_size * 2 + 8 * 2),
498                 nir_imm_int(&b, pipelinestat_block_size * 2));
499    nir_def *input_base = nir_imul(&b, input_stride, global_id);
500 
501    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
502    nir_def *output_base = nir_imul(&b, output_stride, global_id);
503 
504    avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
505 
506    nir_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
507    nir_store_var(&b, available, nir_i2b(&b, available32), 0x1);
508 
509    if (pdev->emulate_mesh_shader_queries) {
510       nir_push_if(&b, nir_test_mask(&b, stats_mask, VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT));
511       {
512          const uint32_t idx = ffs(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT) - 1;
513 
514          nir_def *avail_start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + 4);
515          nir_def *avail_start = nir_load_ssbo(&b, 1, 32, src_buf, avail_start_offset);
516 
517          nir_def *avail_end_offset =
518             nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + pipelinestat_block_size + 4);
519          nir_def *avail_end = nir_load_ssbo(&b, 1, 32, src_buf, avail_end_offset);
520 
521          nir_def *task_invoc_result_available =
522             nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avail_start, avail_end), 0x80000000));
523 
524          nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), task_invoc_result_available), 0x1);
525       }
526       nir_pop_if(&b, NULL);
527    }
528 
529    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
530    nir_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
531    nir_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
532 
533    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
534                            nir_b2i32(&b, nir_load_var(&b, available)));
535 
536    nir_push_if(&b, nir_load_var(&b, available));
537 
538    nir_store_var(&b, output_offset, output_base, 0x1);
539    for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
540       nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i)));
541 
542       nir_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8);
543       nir_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset);
544 
545       nir_def *end_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size);
546       nir_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset);
547 
548       nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
549 
550       nir_push_if(&b,
551                   nir_iand(&b, nir_i2b(&b, uses_emulated_queries),
552                            nir_imm_bool(&b, 1u << i == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)));
553       {
554          /* Compute the emulated result if needed. */
555          nir_def *emu_start_offset = nir_iadd_imm(&b, input_base, pipelinestat_block_size * 2);
556          nir_def *emu_start = nir_load_ssbo(&b, 1, 64, src_buf, emu_start_offset);
557 
558          nir_def *emu_end_offset = nir_iadd_imm(&b, input_base, pipelinestat_block_size * 2 + 8);
559          nir_def *emu_end = nir_load_ssbo(&b, 1, 64, src_buf, emu_end_offset);
560 
561          nir_def *ngg_emu_result = nir_isub(&b, emu_end, emu_start);
562 
563          nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_emu_result), 0x1);
564       }
565       nir_pop_if(&b, NULL);
566 
567       /* Store result */
568       nir_push_if(&b, result_is_64bit);
569 
570       nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset));
571 
572       nir_push_else(&b, NULL);
573 
574       nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset));
575 
576       nir_pop_if(&b, NULL);
577 
578       nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size), 0x1);
579 
580       nir_pop_if(&b, NULL);
581    }
582 
583    nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
584 
585    nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
586 
587    /* Stores zeros in all outputs. */
588 
589    nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
590    nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
591 
592    nir_loop *loop = nir_push_loop(&b);
593 
594    nir_def *current_counter = nir_load_var(&b, counter);
595    radv_break_on_count(&b, counter, elem_count);
596 
597    nir_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter));
598    nir_push_if(&b, result_is_64bit);
599 
600    nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem);
601 
602    nir_push_else(&b, NULL);
603 
604    nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem);
605 
606    nir_pop_if(&b, NULL);
607 
608    nir_pop_loop(&b, loop);
609    nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
610    nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
611    return b.shader;
612 }
613 
614 static void
615 radv_update_hw_pipelinestat(struct radv_cmd_buffer *cmd_buffer)
616 {
617    const uint32_t num_pipeline_stat_queries = radv_get_num_pipeline_stat_queries(cmd_buffer);
618 
619    if (num_pipeline_stat_queries == 0) {
620       cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
621       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
622    } else if (num_pipeline_stat_queries == 1) {
623       cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
624       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
625    }
626 }
627 
628 static void
629 radv_begin_pipeline_stat_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va)
630 {
631    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
632    const struct radv_physical_device *pdev = radv_device_physical(device);
633    struct radeon_cmdbuf *cs = cmd_buffer->cs;
634 
635    radeon_check_space(device->ws, cs, 4);
636 
637    ++cmd_buffer->state.active_pipeline_queries;
638 
639    radv_update_hw_pipelinestat(cmd_buffer);
640 
641    if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
642       uint32_t cs_invoc_offset =
643          radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
644       va += cs_invoc_offset;
645    }
646 
647    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
648    radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
649    radeon_emit(cs, va);
650    radeon_emit(cs, va >> 32);
651 
652    if (pool->uses_emulated_queries) {
653       if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
654          const unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
655          const uint64_t prim_va = va + pipelinestat_block_size * 2;
656 
657          gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, prim_va);
658       }
659 
660       if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
661          uint32_t mesh_invoc_offset =
662             radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
663 
664          gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
665       }
666 
667       /* Record that the command buffer needs GDS. */
668       cmd_buffer->gds_needed = true;
669 
670       if (!cmd_buffer->state.active_emulated_pipeline_queries)
671          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
672 
673       cmd_buffer->state.active_emulated_pipeline_queries++;
674    }
675 
676    if (pool->uses_ace) {
677       uint32_t task_invoc_offset =
678          radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
679 
680       if (pdev->info.gfx_level >= GFX11) {
681          va += task_invoc_offset;
682 
683          radeon_check_space(device->ws, cmd_buffer->gang.cs, 4);
684 
685          radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
686          radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
687          radeon_emit(cmd_buffer->gang.cs, va);
688          radeon_emit(cmd_buffer->gang.cs, va >> 32);
689       } else {
690          radeon_check_space(device->ws, cmd_buffer->gang.cs, 11);
691 
692          gfx10_copy_shader_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
693          radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
694 
695          /* Record that the command buffer needs GDS. */
696          cmd_buffer->gds_needed = true;
697 
698          if (!cmd_buffer->state.active_pipeline_ace_queries)
699             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
700 
701          cmd_buffer->state.active_pipeline_ace_queries++;
702       }
703    }
704 }
705 
706 static void
707 radv_end_pipeline_stat_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
708                              uint64_t avail_va)
709 {
710    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
711    const struct radv_physical_device *pdev = radv_device_physical(device);
712    struct radeon_cmdbuf *cs = cmd_buffer->cs;
713 
714    unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
715 
716    radeon_check_space(device->ws, cs, 16);
717 
718    cmd_buffer->state.active_pipeline_queries--;
719 
720    radv_update_hw_pipelinestat(cmd_buffer);
721 
722    va += pipelinestat_block_size;
723 
724    if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
725       uint32_t cs_invoc_offset =
726          radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
727       va += cs_invoc_offset;
728    }
729 
730    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
731    radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
732    radeon_emit(cs, va);
733    radeon_emit(cs, va >> 32);
734 
735    if (pool->uses_emulated_queries) {
736       if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
737          const uint64_t prim_va = va + pipelinestat_block_size + 8;
738 
739          gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, prim_va);
740       }
741 
742       if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
743          uint32_t mesh_invoc_offset =
744             radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
745 
746          gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
747       }
748 
749       cmd_buffer->state.active_emulated_pipeline_queries--;
750 
751       if (!cmd_buffer->state.active_emulated_pipeline_queries)
752          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
753    }
754 
755    if (pool->uses_ace) {
756       uint32_t task_invoc_offset =
757          radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
758 
759       if (pdev->info.gfx_level >= GFX11) {
760          va += task_invoc_offset;
761 
762          radeon_check_space(device->ws, cmd_buffer->gang.cs, 4);
763 
764          radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
765          radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
766          radeon_emit(cmd_buffer->gang.cs, va);
767          radeon_emit(cmd_buffer->gang.cs, va >> 32);
768       } else {
769          radeon_check_space(device->ws, cmd_buffer->gang.cs, 11);
770 
771          gfx10_copy_shader_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
772          radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
773 
774          cmd_buffer->state.active_pipeline_ace_queries--;
775 
776          if (!cmd_buffer->state.active_pipeline_ace_queries)
777             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
778       }
779    }
780 
781    radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
782                                 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va);
783 }
784 
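/* Resolve pipeline-statistics results with the compute shader above; WAIT_BIT is implemented with WAIT_REG_MEM on the availability dword (and on the task invocation counters when ACE is used). */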
785 static void
786 radv_copy_pipeline_stat_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
787                                      uint32_t first_query, uint32_t query_count, struct radeon_winsys_bo *dst_bo,
788                                      uint64_t dst_offset, uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
789 {
790    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
791    const struct radv_physical_device *pdev = radv_device_physical(device);
792    struct radeon_cmdbuf *cs = cmd_buffer->cs;
793    uint64_t va = radv_buffer_get_va(pool->bo);
794 
795    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
796       const uint32_t task_invoc_offset =
797          radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
798       const unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
799 
800       for (unsigned i = 0; i < query_count; ++i) {
801          unsigned query = first_query + i;
802 
803          radeon_check_space(device->ws, cs, 7);
804 
805          uint64_t avail_va = va + pool->availability_offset + 4 * query;
806 
807          /* This waits on the ME. All copies below are done on the ME */
808          radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
809 
810          if (pool->uses_ace && pdev->emulate_mesh_shader_queries) {
811             const uint64_t src_va = va + query * pool->stride;
812             const uint64_t start_va = src_va + task_invoc_offset + 4;
813             const uint64_t stop_va = start_va + pipelinestat_block_size;
814 
815             radeon_check_space(device->ws, cs, 7 * 2);
816 
817             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, start_va, 0x80000000, 0xffffffff);
818             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, stop_va, 0x80000000, 0xffffffff);
819          }
820       }
821    }
822 
823    radv_query_shader(cmd_buffer, VK_QUERY_TYPE_PIPELINE_STATISTICS, pool->bo, dst_bo, first_query * pool->stride,
824                      dst_offset, pool->stride, stride, dst_size, query_count, flags, pool->vk.pipeline_statistics,
825                      pool->availability_offset + 4 * first_query, pool->uses_emulated_queries);
826 }
827 
828 /**
829  * Transform feedback query
830  */
831 static nir_shader *
832 build_tfb_query_shader(struct radv_device *device)
833 {
834    /* the shader this builds is roughly
835     *
836     * uint32_t src_stride = 32;
837     *
838     * location(binding = 0) buffer dst_buf;
839     * location(binding = 1) buffer src_buf;
840     *
841     * void main() {
842     *	uint64_t result[2] = {};
843     *	bool available = false;
844     *	uint64_t src_offset = src_stride * global_id.x;
845     * 	uint64_t dst_offset = dst_stride * global_id.x;
846     * 	uint64_t *src_data = src_buf[src_offset];
847     *	uint32_t avail = (src_data[0] >> 32) &
848     *			 (src_data[1] >> 32) &
849     *			 (src_data[2] >> 32) &
850     *			 (src_data[3] >> 32);
851     *	if (avail & 0x80000000) {
852     *		result[0] = src_data[3] - src_data[1];
853     *		result[1] = src_data[2] - src_data[0];
854     *		available = true;
855     *	}
856     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
857     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
858     *		if (flags & VK_QUERY_RESULT_64_BIT) {
859     *			dst_buf[dst_offset] = result;
860     *		} else {
861     *			dst_buf[dst_offset] = (uint32_t)result;
862     *		}
863     *	}
864     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
865     *		dst_buf[dst_offset + result_size] = available;
866     * 	}
867     * }
868     */
869    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query");
870    b.shader->info.workgroup_size[0] = 64;
871 
872    /* Create and initialize local variables. */
873    nir_variable *result = nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result");
874    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
875 
876    nir_store_var(&b, result, nir_replicate(&b, nir_imm_int64(&b, 0), 2), 0x3);
877    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
878 
879    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
880 
881    /* Load resources. */
882    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
883    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
884 
885    /* Compute global ID. */
886    nir_def *global_id = get_global_ids(&b, 1);
887 
888    /* Compute src/dst strides. */
889    nir_def *input_stride = nir_imm_int(&b, 32);
890    nir_def *input_base = nir_imul(&b, input_stride, global_id);
891    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
892    nir_def *output_base = nir_imul(&b, output_stride, global_id);
893 
894    /* Load data from the query pool. */
895    nir_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32);
896    nir_def *load2 = nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16);
897 
898    /* Check if result is available. */
899    nir_def *avails[2];
900    avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3));
901    avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3));
902    nir_def *result_is_available = nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000);
903 
904    /* Only compute result if available. */
905    nir_push_if(&b, result_is_available);
906 
907    /* Pack values. */
908    nir_def *packed64[4];
909    packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
910    packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3)));
911    packed64[2] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
912    packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3)));
913 
914    /* Compute result. */
915    nir_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]);
916    nir_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]);
917 
918    nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3);
919    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
920 
921    nir_pop_if(&b, NULL);
922 
923    /* Determine if result is 64 or 32 bit. */
924    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
925    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
926 
927    /* Store the result if complete or partial results have been requested. */
928    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
929 
930    /* Store result. */
931    nir_push_if(&b, result_is_64bit);
932 
933    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
934 
935    nir_push_else(&b, NULL);
936 
937    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
938 
939    nir_pop_if(&b, NULL);
940    nir_pop_if(&b, NULL);
941 
942    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
943                            nir_b2i32(&b, nir_load_var(&b, available)));
944 
945    return b.shader;
946 }
947 
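/* EVENT_WRITE event type that samples streamout statistics for the given stream. */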
948 static unsigned
949 event_type_for_stream(unsigned stream)
950 {
951    switch (stream) {
952    default:
953    case 0:
954       return V_028A90_SAMPLE_STREAMOUTSTATS;
955    case 1:
956       return V_028A90_SAMPLE_STREAMOUTSTATS1;
957    case 2:
958       return V_028A90_SAMPLE_STREAMOUTSTATS2;
959    case 3:
960       return V_028A90_SAMPLE_STREAMOUTSTATS3;
961    }
962 }
963 
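/* Emit the event that samples streamout statistics for stream 'index' into memory at 'va'. */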
964 static void
965 emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
966 {
967    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
968    struct radeon_cmdbuf *cs = cmd_buffer->cs;
969 
970    radeon_check_space(device->ws, cs, 4);
971 
972    assert(index < MAX_SO_STREAMS);
973 
974    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
975    radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
976    radeon_emit(cs, va);
977    radeon_emit(cs, va >> 32);
978 }
979 
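/* On GFX12, emulated shader queries are accumulated in a zero-initialized upload buffer instead of GDS; allocate it on first use. */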
980 static void
981 radv_alloc_shader_query_buf(struct radv_cmd_buffer *cmd_buffer)
982 {
983    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
984    const struct radv_physical_device *pdev = radv_device_physical(device);
985    unsigned offset;
986    void *ptr;
987 
988    assert(pdev->info.gfx_level >= GFX12);
989 
990    if (cmd_buffer->state.shader_query_buf_va)
991       return;
992 
993    if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, RADV_SHADER_QUERY_BUF_SIZE, 64, &offset, &ptr))
994       return;
995 
996    memset(ptr, 0, RADV_SHADER_QUERY_BUF_SIZE);
997 
998    cmd_buffer->state.shader_query_buf_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
999    cmd_buffer->state.shader_query_buf_va += offset;
1000 }
1001 
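/* Begin a transform feedback query: with NGG streamout, the generated/written primitive counters are copied from GDS (or from the shader query buffer on GFX12); otherwise a STREAMOUTSTATS sample is emitted. */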
1002 static void
1003 radv_begin_tfb_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1004 {
1005    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1006    const struct radv_physical_device *pdev = radv_device_physical(device);
1007    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1008 
1009    if (pdev->use_ngg_streamout) {
1010       const bool uses_gds = pdev->info.gfx_level < GFX12;
1011 
1012       /* generated prim counter */
1013       gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
1014       radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
1015 
1016       /* written prim counter */
1017       gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 8);
1018       radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
1019 
1020       /* Record that the command buffer needs GDS. */
1021       cmd_buffer->gds_needed |= uses_gds;
1022 
1023       if (!cmd_buffer->state.active_emulated_prims_xfb_queries)
1024          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1025 
1026       cmd_buffer->state.active_emulated_prims_xfb_queries++;
1027    } else {
1028       cmd_buffer->state.active_prims_xfb_queries++;
1029 
1030       radv_update_hw_pipelinestat(cmd_buffer);
1031 
1032       emit_sample_streamout(cmd_buffer, va, index);
1033    }
1034 }
1035 
1036 static void
1037 radv_end_tfb_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1038 {
1039    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1040    const struct radv_physical_device *pdev = radv_device_physical(device);
1041    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1042 
1043    if (pdev->use_ngg_streamout) {
1044       const bool uses_gds = pdev->info.gfx_level < GFX12;
1045 
1046       /* generated prim counter */
1047       gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
1048       radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
1049 
1050       /* written prim counter */
1051       gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 24);
1052       radv_cs_write_data_imm(cs, V_370_ME, va + 28, 0x80000000);
1053 
1054       cmd_buffer->state.active_emulated_prims_xfb_queries--;
1055 
1056       if (!cmd_buffer->state.active_emulated_prims_xfb_queries)
1057          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1058    } else {
1059       cmd_buffer->state.active_prims_xfb_queries--;
1060 
1061       radv_update_hw_pipelinestat(cmd_buffer);
1062 
1063       emit_sample_streamout(cmd_buffer, va + 16, index);
1064    }
1065 }
1066 
1067 static void
1068 radv_copy_tfb_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
1069                            uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset,
1070                            uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
1071 {
1072    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1073    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1074    uint64_t va = radv_buffer_get_va(pool->bo);
1075 
1076    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1077       for (unsigned i = 0; i < query_count; i++) {
1078          unsigned query = first_query + i;
1079          uint64_t src_va = va + query * pool->stride;
1080 
1081          radeon_check_space(device->ws, cs, 7 * 4);
1082 
1083          /* Wait on the upper word of all results. */
1084          for (unsigned j = 0; j < 4; j++, src_va += 8) {
1085             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1086          }
1087       }
1088    }
1089 
1090    radv_query_shader(cmd_buffer, VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT, pool->bo, dst_bo,
1091                      first_query * pool->stride, dst_offset, pool->stride, stride, dst_size, query_count, flags, 0, 0,
1092                      false);
1093 }
1094 
1095 /**
1096  * Timestamp query
1097  */
1098 static nir_shader *
1099 build_timestamp_query_shader(struct radv_device *device)
1100 {
1101    /* the shader this builds is roughly
1102     *
1103     * uint32_t src_stride = 8;
1104     *
1105     * location(binding = 0) buffer dst_buf;
1106     * location(binding = 1) buffer src_buf;
1107     *
1108     * void main() {
1109     *	uint64_t result = 0;
1110     *	bool available = false;
1111     *	uint64_t src_offset = src_stride * global_id.x;
1112     * 	uint64_t dst_offset = dst_stride * global_id.x;
1113     * 	uint64_t timestamp = src_buf[src_offset];
1114     *	if (timestamp != TIMESTAMP_NOT_READY) {
1115     *		result = timestamp;
1116     *		available = true;
1117     *	}
1118     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
1119     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
1120     *		if (flags & VK_QUERY_RESULT_64_BIT) {
1121     *			dst_buf[dst_offset] = result;
1122     *		} else {
1123     *			dst_buf[dst_offset] = (uint32_t)result;
1124     *		}
1125     *	}
1126     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1127     *		dst_buf[dst_offset + result_size] = available;
1128     * 	}
1129     * }
1130     */
1131    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query");
1132    b.shader->info.workgroup_size[0] = 64;
1133 
1134    /* Create and initialize local variables. */
1135    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
1136    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
1137 
1138    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
1139    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
1140 
1141    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
1142 
1143    /* Load resources. */
1144    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
1145    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
1146 
1147    /* Compute global ID. */
1148    nir_def *global_id = get_global_ids(&b, 1);
1149 
1150    /* Compute src/dst strides. */
1151    nir_def *input_stride = nir_imm_int(&b, 8);
1152    nir_def *input_base = nir_imul(&b, input_stride, global_id);
1153    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
1154    nir_def *output_base = nir_imul(&b, output_stride, global_id);
1155 
1156    /* Load data from the query pool. */
1157    nir_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8);
1158 
1159    /* Pack the timestamp. */
1160    nir_def *timestamp;
1161    timestamp = nir_pack_64_2x32(&b, nir_trim_vector(&b, load, 2));
1162 
1163    /* Check if result is available. */
1164    nir_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY));
1165 
1166    /* Only store result if available. */
1167    nir_push_if(&b, result_is_available);
1168 
1169    nir_store_var(&b, result, timestamp, 0x1);
1170    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
1171 
1172    nir_pop_if(&b, NULL);
1173 
1174    /* Determine if result is 64 or 32 bit. */
1175    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
1176    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
1177 
1178    /* Store the result if complete or partial results have been requested. */
1179    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
1180 
1181    /* Store result. */
1182    nir_push_if(&b, result_is_64bit);
1183 
1184    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
1185 
1186    nir_push_else(&b, NULL);
1187 
1188    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
1189 
1190    nir_pop_if(&b, NULL);
1191 
1192    nir_pop_if(&b, NULL);
1193 
1194    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
1195                            nir_b2i32(&b, nir_load_var(&b, available)));
1196 
1197    return b.shader;
1198 }
1199 
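/* Resolve timestamp results with the compute shader above; WAIT_BIT waits until the high dword no longer matches TIMESTAMP_NOT_READY. */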
1200 static void
1201 radv_copy_timestamp_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
1202                                  uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset,
1203                                  uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
1204 {
1205    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1206    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1207    uint64_t va = radv_buffer_get_va(pool->bo);
1208 
1209    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1210       for (unsigned i = 0; i < query_count; ++i) {
1211          unsigned query = first_query + i;
1212          uint64_t local_src_va = va + query * pool->stride;
1213 
1214          radeon_check_space(device->ws, cs, 7);
1215 
1216          /* Wait on the high 32 bits of the timestamp in
1217           * case the low part is 0xffffffff.
1218           */
1219          radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4, TIMESTAMP_NOT_READY >> 32,
1220                           0xffffffff);
1221       }
1222    }
1223 
1224    radv_query_shader(cmd_buffer, VK_QUERY_TYPE_TIMESTAMP, pool->bo, dst_bo, first_query * pool->stride, dst_offset,
1225                      pool->stride, stride, dst_size, query_count, flags, 0, 0, false);
1226 }
1227 
1228 /**
1229  * Primitives generated query
1230  */
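/* A primitives-generated query slot holds begin/end counter pairs in 32 bytes; with emulated NGG queries, two extra 8-byte counters follow. */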
1231 #define RADV_PGQ_STRIDE     32
1232 #define RADV_PGQ_STRIDE_EMU (RADV_PGQ_STRIDE + 8 * 2)
1233 
1234 static nir_shader *
1235 build_pg_query_shader(struct radv_device *device)
1236 {
1237    /* the shader this builds is roughly
1238     *
1239     * uint32_t src_stride = 32;
1240     *
1241     * location(binding = 0) buffer dst_buf;
1242     * location(binding = 1) buffer src_buf;
1243     *
1244     * void main() {
1245     *	uint64_t result = {};
1246     *	bool available = false;
1247     *	uint64_t src_offset = src_stride * global_id.x;
1248     * 	uint64_t dst_offset = dst_stride * global_id.x;
1249     * 	uint64_t *src_data = src_buf[src_offset];
1250     *	uint32_t avail = (src_data[0] >> 32) &
1251     *			 (src_data[2] >> 32);
1252     *	if (avail & 0x80000000) {
1253     *		result = src_data[2] - src_data[0];
1254     *	        if (use_emulated_queries) {
1255     *			uint32_t ngg_emu_result = 0;
1256     *			ngg_emu_result += src_data[9] - src_data[8];
1257     *			result += (uint64_t)ngg_emu_result;
1258     *	        }
1259     *		available = true;
1260     *	}
1261     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
1262     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
1263     *		if (flags & VK_QUERY_RESULT_64_BIT) {
1264     *			dst_buf[dst_offset] = result;
1265     *		} else {
1266     *			dst_buf[dst_offset] = (uint32_t)result;
1267     *		}
1268     *	}
1269     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1270     *		dst_buf[dst_offset + result_size] = available;
1271     * 	}
1272     * }
1273     */
1274    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query");
1275    b.shader->info.workgroup_size[0] = 64;
1276 
1277    /* Create and initialize local variables. */
1278    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
1279    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
1280 
1281    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
1282    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
1283 
1284    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
1285 
1286    /* Load resources. */
1287    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
1288    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
1289 
1290    /* Compute global ID. */
1291    nir_def *global_id = get_global_ids(&b, 1);
1292 
1293    /* Determine if the query pool uses emulated queries for NGG. */
1294    nir_def *uses_emulated_queries = nir_i2b(&b, nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20));
1295 
1296    /* Compute src/dst strides. */
1297    nir_def *input_stride =
1298       nir_bcsel(&b, uses_emulated_queries, nir_imm_int(&b, RADV_PGQ_STRIDE_EMU), nir_imm_int(&b, RADV_PGQ_STRIDE));
1299    nir_def *input_base = nir_imul(&b, input_stride, global_id);
1300    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
1301    nir_def *output_base = nir_imul(&b, output_stride, global_id);
1302 
1303    /* Load data from the query pool. */
1304    nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
1305    nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16);
1306 
1307    /* Check if result is available. */
1308    nir_def *avails[2];
1309    avails[0] = nir_channel(&b, load1, 1);
1310    avails[1] = nir_channel(&b, load2, 1);
1311    nir_store_var(&b, available, nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000)), 0x1);
1312 
1313    nir_push_if(&b, uses_emulated_queries);
1314    {
1315       nir_def *emu_avail_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 36), .align_mul = 4);
1316       nir_def *emu_avail_end = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 44), .align_mul = 4);
1317       nir_def *emu_result_available =
1318          nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, emu_avail_start, emu_avail_end), 0x80000000));
1319 
1320       nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), emu_result_available), 0x1);
1321    }
1322    nir_pop_if(&b, NULL);
1323 
1324    /* Only compute result if available. */
1325    nir_push_if(&b, nir_load_var(&b, available));
1326 
1327    /* Pack values. */
1328    nir_def *packed64[2];
1329    packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
1330    packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
1331 
1332    /* Compute result. */
1333    nir_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]);
1334 
1335    nir_store_var(&b, result, primitive_storage_needed, 0x1);
1336 
1337    nir_push_if(&b, uses_emulated_queries);
1338    {
1339       nir_def *emu_start =
1340          nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4);
1341       nir_def *emu_end =
1342          nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 4);
1343 
1344       nir_def *ngg_emu_result = nir_isub(&b, emu_end, emu_start);
1345 
1346       nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), nir_u2u64(&b, ngg_emu_result)), 0x1);
1347    }
1348    nir_pop_if(&b, NULL);
1349 
1350    nir_pop_if(&b, NULL);
1351 
1352    /* Determine if result is 64 or 32 bit. */
1353    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
1354    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
1355 
1356    /* Store the result if complete or partial results have been requested. */
1357    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
1358 
1359    /* Store result. */
1360    nir_push_if(&b, result_is_64bit);
1361 
1362    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
1363 
1364    nir_push_else(&b, NULL);
1365 
1366    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
1367 
1368    nir_pop_if(&b, NULL);
1369    nir_pop_if(&b, NULL);
1370 
1371    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
1372                            nir_b2i32(&b, nir_load_var(&b, available)));
1373 
1374    return b.shader;
1375 }
1376 
1377 static void
1378 radv_begin_pg_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint32_t index)
1379 {
1380    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1381    const struct radv_physical_device *pdev = radv_device_physical(device);
1382    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1383 
1384    if (pdev->info.gfx_level >= GFX11) {
1385       const bool uses_gds = pdev->info.gfx_level < GFX12;
1386 
1387       /* On GFX11+, primitives generated queries are always emulated. */
1388       gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
1389       radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
1390 
1391       /* Record that the command buffer needs GDS. */
1392       cmd_buffer->gds_needed |= uses_gds;
1393 
1394       if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1395          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1396 
1397       cmd_buffer->state.active_emulated_prims_gen_queries++;
1398    } else {
1399       if (!cmd_buffer->state.active_prims_gen_queries) {
1400          bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1401 
1402          cmd_buffer->state.active_prims_gen_queries++;
1403 
1404          if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1405             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
1406          }
1407       } else {
1408          cmd_buffer->state.active_prims_gen_queries++;
1409       }
1410 
1411       radv_update_hw_pipelinestat(cmd_buffer);
1412 
1413       if (pool->uses_emulated_queries) {
1414          /* generated prim counter */
1415          gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 32);
1416          radv_cs_write_data_imm(cs, V_370_ME, va + 36, 0x80000000);
1417 
1418          /* Record that the command buffer needs GDS. */
1419          cmd_buffer->gds_needed = true;
1420 
1421          if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1422             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1423 
1424          cmd_buffer->state.active_emulated_prims_gen_queries++;
1425       }
1426 
1427       emit_sample_streamout(cmd_buffer, va, index);
1428    }
1429 }
1430 
1431 static void
1432 radv_end_pg_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint32_t index)
1433 {
1434    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1435    const struct radv_physical_device *pdev = radv_device_physical(device);
1436    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1437 
1438    if (pdev->info.gfx_level >= GFX11) {
1439       const bool uses_gds = pdev->info.gfx_level < GFX12;
1440 
1441       /* On GFX11+, primitives generated queries are always emulated. */
1442       gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
1443       radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
1444 
1445       cmd_buffer->state.active_emulated_prims_gen_queries--;
1446 
1447       if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1448          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1449    } else {
1450       if (cmd_buffer->state.active_prims_gen_queries == 1) {
1451          bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1452 
1453          cmd_buffer->state.active_prims_gen_queries--;
1454 
1455          if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1456             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
1457          }
1458       } else {
1459          cmd_buffer->state.active_prims_gen_queries--;
1460       }
1461 
1462       radv_update_hw_pipelinestat(cmd_buffer);
1463 
1464       if (pool->uses_emulated_queries) {
1465          /* generated prim counter */
1466          gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 40);
1467          radv_cs_write_data_imm(cs, V_370_ME, va + 44, 0x80000000);
1468 
1469          cmd_buffer->state.active_emulated_prims_gen_queries--;
1470 
1471          if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1472             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1473       }
1474 
1475       emit_sample_streamout(cmd_buffer, va + 16, index);
1476    }
1477 }
1478 
1479 static void
1480 radv_copy_pg_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
1481                           uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset, uint64_t dst_size,
1482                           uint64_t stride, VkQueryResultFlags flags)
1483 {
1484    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1485    const struct radv_physical_device *pdev = radv_device_physical(device);
1486    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1487    uint64_t va = radv_buffer_get_va(pool->bo);
1488 
1489    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1490       const bool uses_emulated_queries = pool->uses_emulated_queries && pdev->info.gfx_level < GFX11;
1491 
1492       for (unsigned i = 0; i < query_count; i++) {
1493          unsigned query = first_query + i;
1494          uint64_t src_va = va + query * pool->stride;
1495 
1496          radeon_check_space(device->ws, cs, 7 * 4);
1497 
1498          /* Wait on the upper word of the PrimitiveStorageNeeded result. */
1499          radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1500          radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
1501 
1502          if (uses_emulated_queries) {
1503             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 36, 0x80000000, 0xffffffff);
1504             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 44, 0x80000000, 0xffffffff);
1505          }
1506       }
1507    }
1508 
1509    radv_query_shader(cmd_buffer, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT, pool->bo, dst_bo, first_query * pool->stride,
1510                      dst_offset, pool->stride, stride, dst_size, query_count, flags, 0, 0,
1511                      pool->uses_emulated_queries && pdev->info.gfx_level < GFX11);
1512 }
1513 
1514 /**
1515  * Mesh primitives generated query
1516  */
1517 static nir_shader *
1518 build_ms_prim_gen_query_shader(struct radv_device *device)
1519 {
1520    /* the shader this builds is roughly
1521     *
1522     * uint32_t src_stride = 32;
1523     *
1524     * location(binding = 0) buffer dst_buf;
1525     * location(binding = 1) buffer src_buf;
1526     *
1527     * void main() {
1528     *	uint64_t result = {};
1529     *	bool available = false;
1530     *	uint64_t src_offset = src_stride * global_id.x;
1531     * 	uint64_t dst_offset = dst_stride * global_id.x;
1532     * 	uint64_t *src_data = src_buf[src_offset];
1533     *	uint32_t avail = (src_data[0] >> 32) & (src_data[1] >> 32);
1534     *	if (avail & 0x80000000) {
1535     *		result = src_data[1] - src_data[0];
1536     *		available = true;
1537     *	}
1538     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
1539     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
1540     *		if (flags & VK_QUERY_RESULT_64_BIT) {
1541     *			dst_buf[dst_offset] = result;
1542     *		} else {
1543     *			dst_buf[dst_offset] = (uint32_t)result;
1544     *		}
1545     *	}
1546     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1547     *		dst_buf[dst_offset + result_size] = available;
1548     * 	}
1549     * }
1550     */
1551    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "ms_prim_gen_query");
1552    b.shader->info.workgroup_size[0] = 64;
1553 
1554    /* Create and initialize local variables. */
1555    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
1556    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
1557 
1558    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
1559    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
1560 
1561    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
1562 
1563    /* Load resources. */
1564    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
1565    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
1566 
1567    /* Compute global ID. */
1568    nir_def *global_id = get_global_ids(&b, 1);
1569 
1570    /* Compute src/dst strides. */
1571    nir_def *input_base = nir_imul_imm(&b, global_id, 16);
1572    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
1573    nir_def *output_base = nir_imul(&b, output_stride, global_id);
1574 
1575    /* Load data from the query pool. */
1576    nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
1577    nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd_imm(&b, input_base, 8), .align_mul = 16);
1578 
1579    /* Check if result is available. */
1580    nir_def *avails[2];
1581    avails[0] = nir_channel(&b, load1, 1);
1582    avails[1] = nir_channel(&b, load2, 1);
1583    nir_def *result_is_available = nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000));
1584 
1585    /* Only compute result if available. */
1586    nir_push_if(&b, result_is_available);
1587 
1588    /* Pack values. */
1589    nir_def *packed64[2];
1590    packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
1591    packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
1592 
1593    /* Compute result. */
1594    nir_def *ms_prim_gen = nir_isub(&b, packed64[1], packed64[0]);
1595 
1596    nir_store_var(&b, result, ms_prim_gen, 0x1);
1597 
1598    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
1599 
1600    nir_pop_if(&b, NULL);
1601 
1602    /* Determine if result is 64 or 32 bit. */
1603    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
1604    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
1605 
1606    /* Store the result if complete or partial results have been requested. */
1607    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
1608 
1609    /* Store result. */
1610    nir_push_if(&b, result_is_64bit);
1611 
1612    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
1613 
1614    nir_push_else(&b, NULL);
1615 
1616    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
1617 
1618    nir_pop_if(&b, NULL);
1619    nir_pop_if(&b, NULL);
1620 
1621    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
1622                            nir_b2i32(&b, nir_load_var(&b, available)));
1623 
1624    return b.shader;
1625 }
1626 
1627 static void
1628 radv_begin_ms_prim_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
1629 {
1630    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1631    const struct radv_physical_device *pdev = radv_device_physical(device);
1632    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1633 
1634    if (pdev->info.gfx_level >= GFX11) {
1635       radeon_check_space(device->ws, cs, 4);
1636 
1637       ++cmd_buffer->state.active_pipeline_queries;
1638 
1639       radv_update_hw_pipelinestat(cmd_buffer);
1640 
1641       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1642       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1643       radeon_emit(cs, va);
1644       radeon_emit(cs, va >> 32);
1645    } else {
1646       gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va);
1647       radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
1648 
1649       /* Record that the command buffer needs GDS. */
1650       cmd_buffer->gds_needed = true;
1651 
1652       if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1653          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1654 
1655       cmd_buffer->state.active_emulated_prims_gen_queries++;
1656    }
1657 }
1658 
1659 static void
1660 radv_end_ms_prim_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t avail_va)
1661 {
1662    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1663    const struct radv_physical_device *pdev = radv_device_physical(device);
1664    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1665 
1666    if (pdev->info.gfx_level >= GFX11) {
1667       unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1668 
1669       radeon_check_space(device->ws, cs, 16);
1670 
1671       cmd_buffer->state.active_pipeline_queries--;
1672 
1673       radv_update_hw_pipelinestat(cmd_buffer);
1674 
1675       va += pipelinestat_block_size;
1676 
1677       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1678       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1679       radeon_emit(cs, va);
1680       radeon_emit(cs, va >> 32);
1681 
1682       radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
1683                                    EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va);
1684    } else {
1685       gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va + 8);
1686       radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
1687 
1688       cmd_buffer->state.active_emulated_prims_gen_queries--;
1689 
1690       if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1691          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1692    }
1693 }
1694 
1695 static void
1696 radv_copy_ms_prim_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
1697                                uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset,
1698                                uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
1699 {
1700    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1701    const struct radv_physical_device *pdev = radv_device_physical(device);
1702    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1703    uint64_t va = radv_buffer_get_va(pool->bo);
1704 
1705    if (pdev->info.gfx_level >= GFX11) {
1706       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1707          for (unsigned i = 0; i < query_count; ++i) {
1708             unsigned query = first_query + i;
1709 
1710             radeon_check_space(device->ws, cs, 7);
1711 
1712             uint64_t avail_va = va + pool->availability_offset + 4 * query;
1713 
1714             /* This waits on the ME. All copies below are done on the ME. */
1715             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1716          }
1717       }
1718 
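      /* On GFX11+ the mesh primitives generated count lives in the pipeline
       * statistics block, so reuse the pipeline statistics copy shader with
       * only that counter selected (bit 13 of the stats mask).
       */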
1719       radv_query_shader(cmd_buffer, VK_QUERY_TYPE_PIPELINE_STATISTICS, pool->bo, dst_bo, first_query * pool->stride,
1720                         dst_offset, pool->stride, stride, dst_size, query_count, flags, 1 << 13,
1721                         pool->availability_offset + 4 * first_query, false);
1722    } else {
1723       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1724          for (unsigned i = 0; i < query_count; i++) {
1725             unsigned query = first_query + i;
1726             uint64_t src_va = va + query * pool->stride;
1727 
1728             radeon_check_space(device->ws, cs, 7 * 2);
1729 
1730             /* Wait on the upper word. */
1731             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1732             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 12, 0x80000000, 0xffffffff);
1733          }
1734       }
1735 
1736       radv_query_shader(cmd_buffer, VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT, pool->bo, dst_bo,
1737                         first_query * pool->stride, dst_offset, pool->stride, stride, dst_size, query_count, flags, 0,
1738                         0, false);
1739    }
1740 }
1741 
1742 static VkResult
1743 create_layout(struct radv_device *device, VkPipelineLayout *layout_out)
1744 {
1745    enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_QUERY;
1746 
1747    const VkDescriptorSetLayoutBinding bindings[] = {
1748       {.binding = 0,
1749        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1750        .descriptorCount = 1,
1751        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT},
1752       {
1753          .binding = 1,
1754          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1755          .descriptorCount = 1,
1756          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1757       },
1758    };
1759 
1760    const VkDescriptorSetLayoutCreateInfo desc_info = {
1761       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
1762       .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT,
1763       .bindingCount = 2,
1764       .pBindings = bindings,
1765    };
1766 
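   /* 20 bytes of push constants: the five 32-bit values (flags, dst_stride,
    * pipeline_stats_mask, avail_offset, uses_emulated_queries) packed by
    * radv_query_shader() below.
    */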
1767    const VkPushConstantRange pc_range = {
1768       .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1769       .size = 20,
1770    };
1771 
1772    return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
1773                                       layout_out);
1774 }
1775 
1776 static VkResult
1777 get_pipeline(struct radv_device *device, VkQueryType query_type, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
1778 {
1779    enum radv_meta_object_key_type key = 0;
1780    VkResult result;
1781    nir_shader *cs;
1782 
1783    result = create_layout(device, layout_out);
1784    if (result != VK_SUCCESS)
1785       return result;
1786 
1787    switch (query_type) {
1788    case VK_QUERY_TYPE_OCCLUSION:
1789       key = RADV_META_OBJECT_KEY_QUERY_OCCLUSION;
1790       break;
1791    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1792       key = RADV_META_OBJECT_KEY_QUERY_PIPELINE_STATS;
1793       break;
1794    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1795       key = RADV_META_OBJECT_KEY_QUERY_TFB;
1796       break;
1797    case VK_QUERY_TYPE_TIMESTAMP:
1798       key = RADV_META_OBJECT_KEY_QUERY_TIMESTAMP;
1799       break;
1800    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1801       key = RADV_META_OBJECT_KEY_QUERY_PRIMS_GEN;
1802       break;
1803    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1804       key = RADV_META_OBJECT_KEY_QUERY_MESH_PRIMS_GEN;
1805       break;
1806    default:
1807       unreachable("invalid query type");
1808    }
1809 
1810    VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
1811    if (pipeline_from_cache != VK_NULL_HANDLE) {
1812       *pipeline_out = pipeline_from_cache;
1813       return VK_SUCCESS;
1814    }
1815 
1816    switch (query_type) {
1817    case VK_QUERY_TYPE_OCCLUSION:
1818       cs = build_occlusion_query_shader(device);
1819       break;
1820    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1821       cs = build_pipeline_statistics_query_shader(device);
1822       break;
1823    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1824       cs = build_tfb_query_shader(device);
1825       break;
1826    case VK_QUERY_TYPE_TIMESTAMP:
1827       cs = build_timestamp_query_shader(device);
1828       break;
1829    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1830       cs = build_pg_query_shader(device);
1831       break;
1832    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1833       cs = build_ms_prim_gen_query_shader(device);
1834       break;
1835    default:
1836       unreachable("invalid query type");
1837    }
1838 
1839    const VkPipelineShaderStageCreateInfo stage_info = {
1840       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1841       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1842       .module = vk_shader_module_handle_from_nir(cs),
1843       .pName = "main",
1844       .pSpecializationInfo = NULL,
1845    };
1846 
1847    const VkComputePipelineCreateInfo pipeline_info = {
1848       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1849       .stage = stage_info,
1850       .flags = 0,
1851       .layout = *layout_out,
1852    };
1853 
1854    result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
1855                                             pipeline_out);
1856 
1857    ralloc_free(cs);
1858    return result;
1859 }
1860 
1861 static void
1862 radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkQueryType query_type, struct radeon_winsys_bo *src_bo,
1863                   struct radeon_winsys_bo *dst_bo, uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride,
1864                   uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags, uint32_t pipeline_stats_mask,
1865                   uint32_t avail_offset, bool uses_emulated_queries)
1866 {
1867    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1868    struct radv_meta_saved_state saved_state;
1869    struct radv_buffer src_buffer, dst_buffer;
1870    VkPipelineLayout layout;
1871    VkPipeline pipeline;
1872    VkResult result;
1873 
1874    result = get_pipeline(device, query_type, &pipeline, &layout);
1875    if (result != VK_SUCCESS) {
1876       vk_command_buffer_set_error(&cmd_buffer->vk, result);
1877       return;
1878    }
1879 
1880    /* VK_EXT_conditional_rendering says that copy commands should not be
1881     * affected by conditional rendering.
1882     */
1883    radv_meta_save(&saved_state, cmd_buffer,
1884                   RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS |
1885                      RADV_META_SUSPEND_PREDICATING);
1886 
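   /* The source range has to cover both the per-query data (src_stride *
    * count) and, for pipeline statistics, the availability dwords starting at
    * avail_offset; the destination covers count results at dst_stride, the
    * last one occupying dst_size bytes.
    */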
1887    uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset);
1888    uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size;
1889 
1890    radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset);
1891    radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset);
1892 
1893    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
1894 
1895    radv_meta_push_descriptor_set(
1896       cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, 2,
1897       (VkWriteDescriptorSet[]){{.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1898                                 .dstBinding = 0,
1899                                 .dstArrayElement = 0,
1900                                 .descriptorCount = 1,
1901                                 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1902                                 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer),
1903                                                                          .offset = 0,
1904                                                                          .range = VK_WHOLE_SIZE}},
1905                                {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1906                                 .dstBinding = 1,
1907                                 .dstArrayElement = 0,
1908                                 .descriptorCount = 1,
1909                                 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1910                                 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer),
1911                                                                          .offset = 0,
1912                                                                          .range = VK_WHOLE_SIZE}}});
1913 
1914    /* Encode the number of elements for easy access by the shader. */
1915    pipeline_stats_mask &= (1 << (radv_get_pipelinestat_query_size(device) / 8)) - 1;
1916    pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
1917 
1918    avail_offset -= src_offset;
1919 
1920    struct {
1921       uint32_t flags;
1922       uint32_t dst_stride;
1923       uint32_t pipeline_stats_mask;
1924       uint32_t avail_offset;
1925       uint32_t uses_emulated_queries;
1926    } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_emulated_queries};
1927 
1928    vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), layout, VK_SHADER_STAGE_COMPUTE_BIT, 0,
1929                               sizeof(push_constants), &push_constants);
1930 
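   /* Invalidate caches so the copy shader sees the query data written by the
    * GPU before the dispatch below.
    */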
1931    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1932 
1933    if (flags & VK_QUERY_RESULT_WAIT_BIT)
1934       cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1935 
1936    radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1937 
1938    /* Ensure that the query copy dispatch is complete before a potential vkCmdResetPool because
1939     * there is an implicit execution dependency from each such query command to all query commands
1940     * previously submitted to the same queue.
1941     */
1942    cmd_buffer->active_query_flush_bits |=
1943       RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1944 
1945    radv_buffer_finish(&src_buffer);
1946    radv_buffer_finish(&dst_buffer);
1947 
1948    radv_meta_restore(&saved_state, cmd_buffer);
1949 }
1950 
1951 static void
1952 radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
1953                         struct radv_query_pool *pool)
1954 {
1955    if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
1956       radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
1957 
1958    if (pool->bo)
1959       radv_bo_destroy(device, &pool->vk.base, pool->bo);
1960 
1961    radv_rmv_log_resource_destroy(device, (uint64_t)radv_query_pool_to_handle(pool));
1962    vk_query_pool_finish(&pool->vk);
1963    vk_free2(&device->vk.alloc, pAllocator, pool);
1964 }
1965 
1966 static VkResult
1967 radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *pCreateInfo,
1968                        const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1969 {
1970    struct radv_physical_device *pdev = radv_device_physical(device);
1971    VkResult result;
1972    size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
1973                                 ? sizeof(struct radv_pc_query_pool)
1974                                 : sizeof(struct radv_query_pool);
1975 
1976    struct radv_query_pool *pool =
1977       vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1978 
1979    if (!pool)
1980       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1981 
1982    vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
1983 
1984    /* The number of primitives generated by geometry shader invocations is only counted by the
1985     * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number
1986     * of generated primitives and we have to increment it from the shader using a plain GDS atomic.
1987     */
1988    pool->uses_emulated_queries =
1989       (pdev->emulate_ngg_gs_query_pipeline_stat &&
1990        (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)) ||
1991       (pdev->use_ngg && pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) ||
1992       (pdev->emulate_mesh_shader_queries &&
1993        (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT ||
1994         pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT));
1995 
1996    /* The number of task shader invocations needs to be queried on ACE. */
1997    pool->uses_ace = (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1998 
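   /* On GFX12, shader queries no longer go through GDS; the counters for
    * these query types are read back from a separate shader query buffer
    * (see gfx10_copy_shader_query_gfx).
    */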
1999    pool->uses_shader_query_buf =
2000       pdev->info.gfx_level >= GFX12 && (pool->vk.query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT ||
2001                                         pool->vk.query_type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);
2002 
2003    switch (pCreateInfo->queryType) {
2004    case VK_QUERY_TYPE_OCCLUSION:
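      /* One 64-bit begin/end pair (16 bytes) per render backend; bit 63 of
       * each value is the availability marker checked in GetQueryPoolResults.
       */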
2005       pool->stride = 16 * pdev->info.max_render_backends;
2006       break;
2007    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
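      /* Two pipelinestat blocks per query: one sampled at begin, one at end;
       * each result is computed as end - start for the enabled statistics.
       */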
2008       pool->stride = radv_get_pipelinestat_query_size(device) * 2;
2009       if (pool->uses_emulated_queries) {
2010          /* When the query pool uses emulated queries (for counting the number of primitives
2011           * generated by a geometry shader with NGG), allocate 2x64-bit values for begin/end.
2012           */
2013          pool->stride += 8 * 2;
2014       }
2015       break;
2016    case VK_QUERY_TYPE_TIMESTAMP:
2017    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2018    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2019    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2020    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2021       pool->stride = 8;
2022       break;
2023    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2024       pool->stride = 32;
2025       break;
2026    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
2027       if (pool->uses_emulated_queries && pdev->info.gfx_level < GFX11) {
2028          /* When the hardware can use both the legacy and the NGG paths in the same begin/end pair,
2029           * allocate 2x64-bit values for the emulated counters.
2030           */
2031          pool->stride = RADV_PGQ_STRIDE_EMU;
2032       } else {
2033          pool->stride = RADV_PGQ_STRIDE;
2034       }
2035       break;
2036    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2037       result = radv_pc_init_query_pool(pdev, pCreateInfo, (struct radv_pc_query_pool *)pool);
2038 
2039       if (result != VK_SUCCESS) {
2040          radv_destroy_query_pool(device, pAllocator, pool);
2041          return vk_error(device, result);
2042       }
2043       break;
2044    }
2045    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
2046       if (pdev->info.gfx_level >= GFX11) {
2047          /* GFX11 natively supports mesh generated primitives with pipeline statistics. */
2048          pool->stride = radv_get_pipelinestat_query_size(device) * 2;
2049       } else {
2050          assert(pdev->emulate_mesh_shader_queries);
2051          pool->stride = 16;
2052       }
2053       break;
2054    case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
2055       pool->stride = 48;
2056       break;
2057    default:
2058       unreachable("creating unhandled query type");
2059    }
2060 
2061    pool->availability_offset = pool->stride * pCreateInfo->queryCount;
2062    pool->size = pool->availability_offset;
2063    if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
2064        (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11))
2065       pool->size += 4 * pCreateInfo->queryCount;
2066 
2067    result = radv_bo_create(device, &pool->vk.base, pool->size, 64, RADEON_DOMAIN_GTT,
2068                            RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_QUERY_POOL, 0, false, &pool->bo);
2069    if (result != VK_SUCCESS) {
2070       radv_destroy_query_pool(device, pAllocator, pool);
2071       return vk_error(device, result);
2072    }
2073 
2074    pool->ptr = radv_buffer_map(device->ws, pool->bo);
2075    if (!pool->ptr) {
2076       radv_destroy_query_pool(device, pAllocator, pool);
2077       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
2078    }
2079 
2080    *pQueryPool = radv_query_pool_to_handle(pool);
2081    radv_rmv_log_query_pool_create(device, *pQueryPool);
2082    return VK_SUCCESS;
2083 }
2084 
2085 VKAPI_ATTR VkResult VKAPI_CALL
2086 radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
2087                      const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
2088 {
2089    VK_FROM_HANDLE(radv_device, device, _device);
2090    return radv_create_query_pool(device, pCreateInfo, pAllocator, pQueryPool);
2091 }
2092 
2093 VKAPI_ATTR void VKAPI_CALL
2094 radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator)
2095 {
2096    VK_FROM_HANDLE(radv_device, device, _device);
2097    VK_FROM_HANDLE(radv_query_pool, pool, _pool);
2098 
2099    if (!pool)
2100       return;
2101 
2102    radv_destroy_query_pool(device, pAllocator, pool);
2103 }
2104 
2105 static inline uint64_t
2106 radv_get_rel_timeout_for_query(VkQueryType type)
2107 {
2108    /*
2109     * Certain queries are only possible on certain queue types,
2110     * so pick the TDR timeout of the highest possible type
2111     * and double it to ensure GetQueryPoolResults completes in finite time.
2112     *
2113     * (compute has a longer TDR than gfx and the other rings)
2114     */
2115    switch (type) {
2116    case VK_QUERY_TYPE_OCCLUSION:
2117    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
2118    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2119    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
2120       return radv_get_tdr_timeout_for_ip(AMD_IP_GFX) * 2;
2121    default:
2122       return radv_get_tdr_timeout_for_ip(AMD_IP_COMPUTE) * 2;
2123    }
2124 }
2125 
2126 VKAPI_ATTR VkResult VKAPI_CALL
2127 radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount,
2128                          size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)
2129 {
2130    VK_FROM_HANDLE(radv_device, device, _device);
2131    VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2132    const struct radv_physical_device *pdev = radv_device_physical(device);
2133    char *data = pData;
2134    VkResult result = VK_SUCCESS;
2135 
2136    if (vk_device_is_lost(&device->vk))
2137       return VK_ERROR_DEVICE_LOST;
2138 
2139    for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) {
2140       char *dest = data;
2141       unsigned query = firstQuery + query_idx;
2142       char *src = pool->ptr + query * pool->stride;
2143       uint32_t available;
2144       bool timed_out = false;
2145       uint64_t atimeout = os_time_get_absolute_timeout(radv_get_rel_timeout_for_query(pool->vk.query_type));
2146 
2147       switch (pool->vk.query_type) {
2148       case VK_QUERY_TYPE_TIMESTAMP:
2149       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2150       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2151       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2152       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: {
2153          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2154          uint64_t value;
2155 
2156          do {
2157             value = p_atomic_read(&src64->value);
2158          } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
2159                   !(timed_out = (atimeout < os_time_get_nano())));
2160 
2161          available = value != TIMESTAMP_NOT_READY;
2162 
2163          if (timed_out)
2164             result = VK_ERROR_DEVICE_LOST;
2165          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2166             result = VK_NOT_READY;
2167 
2168          if (flags & VK_QUERY_RESULT_64_BIT) {
2169             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2170                *(uint64_t *)dest = value;
2171             dest += 8;
2172          } else {
2173             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2174                *(uint32_t *)dest = (uint32_t)value;
2175             dest += 4;
2176          }
2177          break;
2178       }
2179       case VK_QUERY_TYPE_OCCLUSION: {
2180          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2181          uint32_t db_count = pdev->info.max_render_backends;
2182          uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
2183          uint64_t sample_count = 0;
2184          available = 1;
2185 
2186          for (int i = 0; i < db_count; ++i) {
2187             uint64_t start, end;
2188 
2189             if (!(enabled_rb_mask & (1ull << i)))
2190                continue;
2191 
2192             do {
2193                start = p_atomic_read(&src64[2 * i].value);
2194                end = p_atomic_read(&src64[2 * i + 1].value);
2195             } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
2196                      !(timed_out = (atimeout < os_time_get_nano())));
2197 
2198             if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
2199                available = 0;
2200             else {
2201                sample_count += end - start;
2202             }
2203          }
2204 
2205          if (timed_out)
2206             result = VK_ERROR_DEVICE_LOST;
2207          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2208             result = VK_NOT_READY;
2209 
2210          if (flags & VK_QUERY_RESULT_64_BIT) {
2211             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2212                *(uint64_t *)dest = sample_count;
2213             dest += 8;
2214          } else {
2215             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2216                *(uint32_t *)dest = sample_count;
2217             dest += 4;
2218          }
2219          break;
2220       }
2221       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
2222          unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
2223          const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
2224          uint64_t ngg_emu_result = 0;
2225 
2226          do {
2227             available = p_atomic_read(avail_ptr);
2228 
2229             if (pool->uses_ace && pdev->emulate_mesh_shader_queries) {
2230                const uint32_t task_invoc_offset =
2231                   radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
2232                const uint32_t *avail_ptr_start = (const uint32_t *)(src + task_invoc_offset + 4);
2233                const uint32_t *avail_ptr_stop =
2234                   (const uint32_t *)(src + pipelinestat_block_size + task_invoc_offset + 4);
2235 
2236                if (!(p_atomic_read(avail_ptr_start) & 0x80000000) || !(p_atomic_read(avail_ptr_stop) & 0x80000000))
2237                   available = 0;
2238             }
2239          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
2240 
2241          if (timed_out)
2242             result = VK_ERROR_DEVICE_LOST;
2243          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2244             result = VK_NOT_READY;
2245 
2246          if (pool->uses_emulated_queries) {
2247             /* Compute the result that was copied from the emulated counters. */
2248             const uint64_t *emu_start = (uint64_t *)(src + pipelinestat_block_size * 2);
2249             const uint64_t *emu_stop = (uint64_t *)(src + pipelinestat_block_size * 2 + 8);
2250 
2251             ngg_emu_result = emu_stop[0] - emu_start[0];
2252          }
2253 
2254          const uint64_t *start = (uint64_t *)src;
2255          const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
2256          if (flags & VK_QUERY_RESULT_64_BIT) {
2257             uint64_t *dst = (uint64_t *)dest;
2258             dest += util_bitcount(pool->vk.pipeline_statistics) * 8;
2259             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
2260                if (pool->vk.pipeline_statistics & (1u << i)) {
2261                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
2262                      *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
2263 
2264                      if (pool->uses_emulated_queries &&
2265                          (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)
2266                         *dst += ngg_emu_result;
2267                   }
2268                   dst++;
2269                }
2270             }
2271 
2272          } else {
2273             uint32_t *dst = (uint32_t *)dest;
2274             dest += util_bitcount(pool->vk.pipeline_statistics) * 4;
2275             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
2276                if (pool->vk.pipeline_statistics & (1u << i)) {
2277                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
2278                      *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
2279 
2280                      if (pool->uses_emulated_queries &&
2281                          (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)
2282                         *dst += ngg_emu_result;
2283                   }
2284                   dst++;
2285                }
2286             }
2287          }
2288          break;
2289       }
2290       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
2291          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2292          uint64_t num_primitives_written;
2293          uint64_t primitive_storage_needed;
2294 
2295          /* SAMPLE_STREAMOUTSTATS stores this structure:
2296           * {
2297           *	u64 NumPrimitivesWritten;
2298           *	u64 PrimitiveStorageNeeded;
2299           * }
2300           */
2301          do {
2302             available = 1;
2303             for (int j = 0; j < 4; j++) {
2304                if (!(p_atomic_read(&src64[j].value) & 0x8000000000000000UL))
2305                   available = 0;
2306             }
2307          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
2308 
2309          if (timed_out)
2310             result = VK_ERROR_DEVICE_LOST;
2311          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2312             result = VK_NOT_READY;
2313 
2314          num_primitives_written = p_atomic_read_relaxed(&src64[3].value) - p_atomic_read_relaxed(&src64[1].value);
2315          primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
2316 
2317          if (flags & VK_QUERY_RESULT_64_BIT) {
2318             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2319                *(uint64_t *)dest = num_primitives_written;
2320             dest += 8;
2321             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2322                *(uint64_t *)dest = primitive_storage_needed;
2323             dest += 8;
2324          } else {
2325             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2326                *(uint32_t *)dest = num_primitives_written;
2327             dest += 4;
2328             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2329                *(uint32_t *)dest = primitive_storage_needed;
2330             dest += 4;
2331          }
2332          break;
2333       }
2334       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
2335          const bool uses_emulated_queries = pool->uses_emulated_queries && pdev->info.gfx_level < GFX11;
2336          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2337          uint64_t primitive_storage_needed;
2338 
2339          /* SAMPLE_STREAMOUTSTATS stores this structure:
2340           * {
2341           *	u64 NumPrimitivesWritten;
2342           *	u64 PrimitiveStorageNeeded;
2343           * }
2344           */
2345          do {
2346             available = 1;
2347             if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
2348                 !(p_atomic_read(&src64[2].value) & 0x8000000000000000UL)) {
2349                available = 0;
2350             }
2351             if (uses_emulated_queries && (!(p_atomic_read(&src64[4].value) & 0x8000000000000000UL) ||
2352                                           !(p_atomic_read(&src64[5].value) & 0x8000000000000000UL))) {
2353                available = 0;
2354             }
2355          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
2356 
2357          if (timed_out)
2358             result = VK_ERROR_DEVICE_LOST;
2359          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2360             result = VK_NOT_READY;
2361 
2362          primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
2363 
2364          if (uses_emulated_queries) {
2365             /* Accumulate the result copied from the emulated queries in case an NGG shader has been used. */
2366             primitive_storage_needed += p_atomic_read_relaxed(&src64[5].value) - p_atomic_read_relaxed(&src64[4].value);
2367          }
2368 
2369          if (flags & VK_QUERY_RESULT_64_BIT) {
2370             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2371                *(uint64_t *)dest = primitive_storage_needed;
2372             dest += 8;
2373          } else {
2374             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2375                *(uint32_t *)dest = primitive_storage_needed;
2376             dest += 4;
2377          }
2378          break;
2379       }
2380       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2381          struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
2382          const p_atomic_uint64_t *src64 = (const p_atomic_uint64_t *)src;
2383          bool avail;
2384          do {
2385             avail = true;
2386             for (unsigned i = 0; i < pc_pool->num_passes; ++i)
2387                if (!p_atomic_read(&src64[pool->stride / 8 - i - 1].value))
2388                   avail = false;
2389          } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
2390 
2391          available = avail;
2392 
2393          radv_pc_get_results(pc_pool, &src64->value, dest);
2394          dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
2395          break;
2396       }
2397       case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
2398          uint64_t ms_prim_gen;
2399 
2400          if (pdev->info.gfx_level >= GFX11) {
2401             unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
2402             const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
2403 
2404             do {
2405                available = p_atomic_read(avail_ptr);
2406             } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
2407                      !(timed_out = (atimeout < os_time_get_nano())));
2408 
2409             const uint64_t *start = (uint64_t *)src;
2410             const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
2411 
2412             ms_prim_gen = stop[pipeline_statistics_indices[13]] - start[pipeline_statistics_indices[13]];
2413          } else {
2414             p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2415 
2416             do {
2417                available = 1;
2418                if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
2419                    !(p_atomic_read(&src64[1].value) & 0x8000000000000000UL)) {
2420                   available = 0;
2421                }
2422             } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
2423                      !(timed_out = (atimeout < os_time_get_nano())));
2424 
2425             ms_prim_gen = p_atomic_read_relaxed(&src64[1].value) - p_atomic_read_relaxed(&src64[0].value);
2426          }
2427 
2428          if (timed_out)
2429             result = VK_ERROR_DEVICE_LOST;
2430          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2431             result = VK_NOT_READY;
2432 
2433          if (flags & VK_QUERY_RESULT_64_BIT) {
2434             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2435                *(uint64_t *)dest = ms_prim_gen;
2436             dest += 8;
2437          } else {
2438             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2439                *(uint32_t *)dest = ms_prim_gen;
2440             dest += 4;
2441          }
2442          break;
2443       }
2444       case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR: {
2445          uint32_t *src32 = (uint32_t *)src;
2446          uint32_t value;
         do {
            value = p_atomic_read(&src32[1]);
         } while (value != 1 && (flags & VK_QUERY_RESULT_WAIT_BIT));

         available = value != 0;

         if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
            result = VK_NOT_READY;

         if (flags & VK_QUERY_RESULT_64_BIT) {
            uint64_t *dest64 = (uint64_t *)dest;
            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
               dest64[0] = src32[5];
               dest64[1] = src32[6];
            }
            dest += 16;
            if (flags & VK_QUERY_RESULT_WITH_STATUS_BIT_KHR) {
               dest64[2] = 1;
               dest += 8;
            }
         } else {
            uint32_t *dest32 = (uint32_t *)dest;
            if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
               dest32[0] = src32[5];
               dest32[1] = src32[6];
            }
            dest += 8;
            if (flags & VK_QUERY_RESULT_WITH_STATUS_BIT_KHR) {
               dest32[2] = 1;
               dest += 4;
            }
         }
         break;
      }
      default:
         unreachable("trying to get results of unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         if (flags & VK_QUERY_RESULT_64_BIT) {
            *(uint64_t *)dest = available;
         } else {
            *(uint32_t *)dest = available;
         }
      }
   }

   if (result == VK_ERROR_DEVICE_LOST)
      vk_device_set_lost(&device->vk, "GetQueryPoolResults timed out");

   return result;
}

static void
emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool)
{
   if (cmd_buffer->pending_reset_query) {
      if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
         /* Only need to flush caches if the query pool size is
          * large enough to be reset using the compute shader
          * path. Small pools don't need any cache flushes
          * because we use a CP DMA clear.
          */
         radv_emit_cache_flush(cmd_buffer);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
                             uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   VK_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   const uint64_t dst_offset = dst_buffer->offset + dstOffset;
   const size_t dst_size = radv_query_result_size(pool, flags);

   if (!queryCount)
      return;

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, pool->bo);
   radv_cs_add_buffer(device->ws, cmd_buffer->cs, dst_buffer->bo);

   /* Work around engines that forget to properly specify WAIT_BIT because some drivers
    * implicitly synchronize before query copies.
    */
   if (instance->drirc.flush_before_query_copy)
      cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

   /* From the Vulkan spec 1.1.108:
    *
    * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
    *  previous uses of vkCmdResetQueryPool in the same queue, without any
    *  additional synchronization."
    *
    * So, we have to flush the caches if the compute shader path was used.
    */
   emit_query_flush(cmd_buffer, pool);

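   /* The copy helpers below run on the GPU (a small meta shader per query type) to convert the
    * raw query data into the requested result layout.
    */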
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      radv_copy_occlusion_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size,
                                       stride, flags);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      radv_copy_pipeline_stat_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset,
                                           dst_size, stride, flags);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
      radv_copy_timestamp_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size,
                                       stride, flags);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      radv_copy_tfb_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size, stride,
                                 flags);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      radv_copy_pg_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size, stride,
                                flags);
      break;
   case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
      radv_copy_ms_prim_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size,
                                     stride, flags);
      break;
   default:
      unreachable("trying to get results of unhandled query type");
   }
}

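/* Value the pool memory is filled with on reset: timestamp-style queries use TIMESTAMP_NOT_READY
 * (all ones) as their "not written yet" marker, everything else is zero-initialized.
 */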
static uint32_t
query_clear_value(VkQueryType type)
{
   switch (type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
   case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
      return (uint32_t)TIMESTAMP_NOT_READY;
   default:
      return 0;
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   uint32_t value = query_clear_value(pool->vk.query_type);
   uint32_t flush_bits = 0;

   if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC || cmd_buffer->qf == RADV_QUEUE_VIDEO_ENC)
      /* video queries don't work like this */
      return;

   /* Make sure to sync all previous work if the given command buffer has
    * pending active queries. Otherwise the GPU might write query data
    * after the reset operation.
    */
   cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

   flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
                                  queryCount * pool->stride, value);

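   /* Pipeline-statistics queries (and mesh-primitive queries on GFX11, which reuse the
    * pipeline-stats path) also track a per-query availability dword that must be cleared.
    */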
   if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
       (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11)) {
      flush_bits |=
         radv_fill_buffer(cmd_buffer, NULL, pool->bo,
                          radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4, queryCount * 4, 0);
   }

   if (flush_bits) {
      /* Only need to flush caches for the compute shader path. */
      cmd_buffer->pending_reset_query = true;
      cmd_buffer->state.flush_bits |= flush_bits;
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   const struct radv_physical_device *pdev = radv_device_physical(device);

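   /* Host-side reset: the pool memory is CPU-mapped (pool->ptr), so the clear pattern is written
    * directly from the host.
    */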
   uint32_t value = query_clear_value(pool->vk.query_type);
   uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
   uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);

   for (uint32_t *p = data; p != data_end; ++p)
      *p = value;

   if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
       (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11)) {
      memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
   }
}

static void
emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, VkQueryType query_type,
                 VkQueryControlFlags flags, uint32_t index)
{
   switch (query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      radv_begin_occlusion_query(cmd_buffer, va, flags);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      radv_begin_pipeline_stat_query(cmd_buffer, pool, va);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      radv_begin_tfb_query(cmd_buffer, va, index);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      radv_begin_pg_query(cmd_buffer, pool, va, index);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
      break;
   case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
      radv_begin_ms_prim_query(cmd_buffer, va);
      break;
   case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
      cmd_buffer->video.feedback_query_va = va;
      break;
   default:
      unreachable("beginning unhandled query type");
   }
}

static void
emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint64_t avail_va,
               VkQueryType query_type, uint32_t index)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);

   switch (query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      radv_end_occlusion_query(cmd_buffer, va);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      radv_end_pipeline_stat_query(cmd_buffer, pool, va, avail_va);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      radv_end_tfb_query(cmd_buffer, va, index);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      radv_end_pg_query(cmd_buffer, pool, va, index);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
      break;
   case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
      radv_end_ms_prim_query(cmd_buffer, va, avail_va);
      break;
   case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
      cmd_buffer->video.feedback_query_va = 0;
      break;
   default:
      unreachable("ending unhandled query type");
   }

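   /* Record which flushes are needed before the query memory can safely be reset or copied;
    * these bits are applied by vkCmdResetQueryPool (and, with the drirc workaround, before
    * result copies).
    */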
   cmd_buffer->active_query_flush_bits |=
      RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
   if (pdev->info.gfx_level >= GFX9) {
      cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
                             VkQueryControlFlags flags, uint32_t index)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(pool->bo);

   radv_cs_add_buffer(device->ws, cs, pool->bo);

   emit_query_flush(cmd_buffer, pool);

   va += pool->stride * query;

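   /* Pools that are also written from the gang (ACE) command buffer, e.g. for task shader
    * statistics, need the companion cmdbuf initialized and the pool BO made resident on it too.
    */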
   if (pool->uses_ace) {
      if (!radv_gang_init(cmd_buffer))
         return;

      radv_cs_add_buffer(device->ws, cmd_buffer->gang.cs, pool->bo);
   }

   if (pool->uses_shader_query_buf)
      radv_alloc_shader_query_buf(cmd_buffer);

   emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, flags, index);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, uint32_t index)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   uint64_t va = radv_buffer_get_va(pool->bo);
   uint64_t avail_va = va + pool->availability_offset + 4 * query;
   va += pool->stride * query;

   /* No need to add the pool BO to the list because the query must
    * currently be active, which means the BO is already in the list.
    */
   emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, index);

   /*
    * For multiview we have to emit a query for each bit in the mask,
    * however the first query emitted accumulates the totals for all
    * views, so we don't want real values in the other queries.
    * This emits a fake begin/end sequence so the waiting code gets a
    * completed query value and doesn't hang, but the query returns 0.
    */
   if (cmd_buffer->state.render.view_mask) {
      for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.render.view_mask); i++) {
         va += pool->stride;
         avail_va += 4;
         emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, 0, 0);
         emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, 0);
      }
   }
}

void
radv_write_timestamp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkPipelineStageFlags2 stage)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

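   /* TOP_OF_PIPE timestamps are written immediately by copying the current GPU clock with
    * COPY_DATA; every other stage uses a bottom-of-pipe end-of-pipe (EOP) event that writes the
    * timestamp once prior work has retired.
    */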
   if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM | COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
                         COPY_DATA_DST_SEL(V_370_MEM));
      radeon_emit(cs, 0);
      radeon_emit(cs, 0);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
   } else {
      radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
                                   EOP_DST_SEL_MEM, EOP_DATA_SEL_TIMESTAMP, va, 0, cmd_buffer->gfx9_eop_bug_va);
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkQueryPool queryPool,
                        uint32_t query)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   const unsigned num_queries = MAX2(util_bitcount(cmd_buffer->state.render.view_mask), 1);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const uint64_t va = radv_buffer_get_va(pool->bo);
   uint64_t query_va = va + pool->stride * query;

   radv_cs_add_buffer(device->ws, cs, pool->bo);

   assert(cmd_buffer->qf != RADV_QUEUE_VIDEO_DEC &&
          cmd_buffer->qf != RADV_QUEUE_VIDEO_ENC);

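   /* The transfer (SDMA) queue cannot use PM4 packets, so each timestamp is written with an
    * SDMA GET_GLOBAL_TIMESTAMP packet, one per active view.
    */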
   if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
      if (instance->drirc.flush_before_timestamp_write) {
         radeon_check_space(device->ws, cmd_buffer->cs, 1);
         radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
      }

      for (unsigned i = 0; i < num_queries; ++i, query_va += pool->stride) {
         radeon_check_space(device->ws, cmd_buffer->cs, 3);
         radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
         radeon_emit(cs, query_va);
         radeon_emit(cs, query_va >> 32);
      }
      return;
   }

   if (instance->drirc.flush_before_timestamp_write) {
      /* Make sure previously launched waves have finished */
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
   }

   radv_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 28 * num_queries);

   for (unsigned i = 0; i < num_queries; i++) {
      radv_write_timestamp(cmd_buffer, query_va, stage);
      query_va += pool->stride;
   }

   cmd_buffer->active_query_flush_bits |=
      RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
   if (pdev->info.gfx_level >= GFX9) {
      cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
                                                 const VkAccelerationStructureKHR *pAccelerationStructures,
                                                 VkQueryType queryType, VkQueryPool queryPool, uint32_t firstQuery)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t pool_va = radv_buffer_get_va(pool->bo);
   uint64_t query_va = pool_va + pool->stride * firstQuery;

   radv_cs_add_buffer(device->ws, cs, pool->bo);

   radv_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 6 * accelerationStructureCount);

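   /* Each result is a single 64-bit field of the acceleration structure header, copied into the
    * query slot with a CP COPY_DATA packet.
    */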
   for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
      VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pAccelerationStructures[i]);
      uint64_t va = vk_acceleration_structure_get_va(accel_struct);

      switch (queryType) {
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, compacted_size);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, serialization_size);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
         va += offsetof(struct radv_accel_struct_header, instance_count);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, size);
         break;
      default:
         unreachable("Unhandled accel struct query type.");
      }

      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, query_va);
      radeon_emit(cs, query_va >> 32);

      query_va += pool->stride;
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}