1 /*
2  * Copyright 2016 Red Hat Inc.
3  * Based on anv:
4  * Copyright © 2015 Intel Corporation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23  * IN THE SOFTWARE.
24  */
25 
26 #include <assert.h>
27 #include <fcntl.h>
28 #include <stdbool.h>
29 #include <string.h>
30 
31 #include "bvh/bvh.h"
32 #include "meta/radv_meta.h"
33 #include "nir/nir_builder.h"
34 #include "util/u_atomic.h"
35 #include "vulkan/vulkan_core.h"
36 #include "radv_cs.h"
37 #include "radv_private.h"
38 #include "sid.h"
39 #include "vk_acceleration_structure.h"
40 #include "vk_common_entrypoints.h"
41 
42 #define TIMESTAMP_NOT_READY UINT64_MAX
43 
44 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10, 13, 11, 12};
45 
46 static unsigned
47 radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query)
48 {
49    uint32_t idx = ffs(query) - 1;
50    return pipeline_statistics_indices[idx] * 8;
51 }
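/* For example, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT is bit 0,
 * so pipeline_statistics_indices[0] == 7 maps it to byte offset 7 * 8 == 56
 * within a pipeline-statistics block.
 */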
52 
53 static unsigned
54 radv_get_pipelinestat_query_size(struct radv_device *device)
55 {
56    /* GFX10_3 only has 11 valid pipeline statistics queries but in order to emulate mesh/task shader
57     * invocations, it's easier to use the same size as GFX11.
58     */
59    unsigned num_results = device->physical_device->rad_info.gfx_level >= GFX10_3 ? 14 : 11;
60    return num_results * 8;
61 }
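/* Concretely, this is 14 * 8 = 112 bytes per block on GFX10_3 and newer and
 * 11 * 8 = 88 bytes on older generations; each query stores two such blocks
 * (begin and end snapshots), see radv_create_query_pool() below.
 */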
62 
63 static void
64 radv_store_availability(nir_builder *b, nir_def *flags, nir_def *dst_buf, nir_def *offset, nir_def *value32)
65 {
66    nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
67 
68    nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
69 
70    nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);
71 
72    nir_push_else(b, NULL);
73 
74    nir_store_ssbo(b, value32, dst_buf, offset);
75 
76    nir_pop_if(b, NULL);
77 
78    nir_pop_if(b, NULL);
79 }
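/* Rough sketch of what radv_store_availability() emits, in the same GLSL-like
 * pseudocode style used for the copy shaders below (illustrative only):
 *
 *    if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
 *       if (flags & VK_QUERY_RESULT_64_BIT)
 *          dst_buf[offset] = uvec2(value32, 0);  // 64-bit availability word
 *       else
 *          dst_buf[offset] = value32;            // 32-bit availability word
 *    }
 */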
80 
81 static nir_shader *
82 build_occlusion_query_shader(struct radv_device *device)
83 {
84    /* the shader this builds is roughly
85     *
86     * push constants {
87     * 	uint32_t flags;
88     * 	uint32_t dst_stride;
89     * };
90     *
91     * uint32_t src_stride = 16 * db_count;
92     *
93     * location(binding = 0) buffer dst_buf;
94     * location(binding = 1) buffer src_buf;
95     *
96     * void main() {
97     * 	uint64_t result = 0;
98     * 	uint64_t src_offset = src_stride * global_id.x;
99     * 	uint64_t dst_offset = dst_stride * global_id.x;
100     * 	bool available = true;
101     * 	for (int i = 0; i < db_count; ++i) {
102     *		if (enabled_rb_mask & BITFIELD64_BIT(i)) {
103     *			uint64_t start = src_buf[src_offset + 16 * i];
104     *			uint64_t end = src_buf[src_offset + 16 * i + 8];
105     *			if ((start & (1ull << 63)) && (end & (1ull << 63)))
106     *				result += end - start;
107     *			else
108     *				available = false;
109     *		}
110     * 	}
111     * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
112     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
113     * 		if (flags & VK_QUERY_RESULT_64_BIT)
114     * 			dst_buf[dst_offset] = result;
115     * 		else
116     * 			dst_buf[dst_offset] = (uint32_t)result;
117     * 	}
118     * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
119     * 		dst_buf[dst_offset + elem_size] = available;
120     * 	}
121     * }
122     */
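   /* Source layout assumed by the pseudocode above: each enabled render backend
    * writes a 16-byte {zpass_begin, zpass_end} pair, and bit 63 of each value
    * acts as the "result written" flag.
    */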
123    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
124    b.shader->info.workgroup_size[0] = 64;
125 
126    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
127    nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
128    nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
129    nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
130    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
131    uint64_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
132    unsigned db_count = device->physical_device->rad_info.max_render_backends;
133 
134    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
135 
136    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
137    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
138 
139    nir_def *global_id = get_global_ids(&b, 1);
140 
141    nir_def *input_stride = nir_imm_int(&b, db_count * 16);
142    nir_def *input_base = nir_imul(&b, input_stride, global_id);
143    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
144    nir_def *output_base = nir_imul(&b, output_stride, global_id);
145 
146    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
147    nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
148    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
149 
150    nir_def *query_result_wait = nir_test_mask(&b, flags, VK_QUERY_RESULT_WAIT_BIT);
151    nir_push_if(&b, query_result_wait);
152    {
153       /* Wait on the upper word of the last DB entry. */
154       nir_push_loop(&b);
155       {
156          const uint32_t rb_avail_offset = 16 * util_last_bit64(enabled_rb_mask) - 4;
157 
158          /* Prevent the SSBO load from being moved out of the loop. */
159          nir_scoped_memory_barrier(&b, SCOPE_INVOCATION, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo);
160 
161          nir_def *load_offset = nir_iadd_imm(&b, input_base, rb_avail_offset);
162          nir_def *load = nir_load_ssbo(&b, 1, 32, src_buf, load_offset, .align_mul = 4, .access = ACCESS_COHERENT);
163 
164          nir_push_if(&b, nir_ige_imm(&b, load, 0x80000000));
165          {
166             nir_jump(&b, nir_jump_break);
167          }
168          nir_pop_if(&b, NULL);
169       }
170       nir_pop_loop(&b, NULL);
171    }
172    nir_pop_if(&b, NULL);
173 
174    nir_push_loop(&b);
175 
176    nir_def *current_outer_count = nir_load_var(&b, outer_counter);
177    radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
178 
179    nir_def *enabled_cond = nir_iand_imm(&b, nir_ishl(&b, nir_imm_int64(&b, 1), current_outer_count), enabled_rb_mask);
180 
181    nir_push_if(&b, nir_i2b(&b, enabled_cond));
182 
183    nir_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
184    load_offset = nir_iadd(&b, input_base, load_offset);
185 
186    nir_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);
187 
188    nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
189    nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);
190 
191    nir_def *start_done = nir_ilt_imm(&b, nir_load_var(&b, start), 0);
192    nir_def *end_done = nir_ilt_imm(&b, nir_load_var(&b, end), 0);
193 
194    nir_push_if(&b, nir_iand(&b, start_done, end_done));
195 
196    nir_store_var(&b, result,
197                  nir_iadd(&b, nir_load_var(&b, result), nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
198                  0x1);
199 
200    nir_push_else(&b, NULL);
201 
202    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
203 
204    nir_pop_if(&b, NULL);
205    nir_pop_if(&b, NULL);
206    nir_pop_loop(&b, NULL);
207 
208    /* Store the result if complete or if partial results have been requested. */
209 
210    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
211    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
212    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
213 
214    nir_push_if(&b, result_is_64bit);
215 
216    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);
217 
218    nir_push_else(&b, NULL);
219 
220    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base, .align_mul = 8);
221 
222    nir_pop_if(&b, NULL);
223    nir_pop_if(&b, NULL);
224 
225    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
226                            nir_b2i32(&b, nir_load_var(&b, available)));
227 
228    return b.shader;
229 }
230 
231 static nir_shader *
232 build_pipeline_statistics_query_shader(struct radv_device *device)
233 {
234    unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
235 
236    /* the shader this builds is roughly
237     *
238     * push constants {
239     * 	uint32_t flags;
240     * 	uint32_t dst_stride;
241     * 	uint32_t stats_mask;
242     * 	uint32_t avail_offset;
243     * };
244     *
245     * uint32_t src_stride = pipelinestat_block_size * 2;
246     *
247     * location(binding = 0) buffer dst_buf;
248     * location(binding = 1) buffer src_buf;
249     *
250     * void main() {
251     * 	uint64_t src_offset = src_stride * global_id.x;
252     * 	uint64_t dst_base = dst_stride * global_id.x;
253     * 	uint64_t dst_offset = dst_base;
254     * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
255     * 	uint32_t elem_count = stats_mask >> 16;
256     * 	uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
257     * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
258     * 		dst_buf[dst_offset + elem_count * elem_size] = available32;
259     * 	}
260     * 	if ((bool)available32) {
261     * 		// repeat 11 times:
262     * 		if (stats_mask & (1 << 0)) {
263     * 			uint64_t start = src_buf[src_offset + 8 * indices[0]];
264     * 			uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
265     * 			uint64_t result = end - start; if (flags & VK_QUERY_RESULT_64_BIT)
266     * 				dst_buf[dst_offset] = result;
267     * 			else
268     * 				dst_buf[dst_offset] = (uint32_t)result;
269     * 			dst_offset += elem_size;
270     * 		}
271     * 	} else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
272     *              // Set everything to 0 as we don't know what is valid.
273     * 		for (int i = 0; i < elem_count; ++i)
274     * 			dst_buf[dst_base + elem_size * i] = 0;
275     * 	}
276     * }
277     */
278    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query");
279    b.shader->info.workgroup_size[0] = 64;
280 
281    nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
282    nir_variable *result = nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
283    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
284 
285    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
286    nir_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
287    nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
288 
289    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
290    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
291 
292    nir_def *global_id = get_global_ids(&b, 1);
293 
294    nir_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
295    nir_def *input_base = nir_imul(&b, input_stride, global_id);
296    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
297    nir_def *output_base = nir_imul(&b, output_stride, global_id);
298 
299    avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
300 
301    nir_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
302    nir_store_var(&b, available, nir_i2b(&b, available32), 0x1);
303 
304    if (device->physical_device->emulate_mesh_shader_queries) {
305       nir_push_if(&b, nir_test_mask(&b, stats_mask, VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT));
306       {
307          const uint32_t idx = ffs(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT) - 1;
308 
309          nir_def *avail_start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + 4);
310          nir_def *avail_start = nir_load_ssbo(&b, 1, 32, src_buf, avail_start_offset);
311 
312          nir_def *avail_end_offset =
313             nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + pipelinestat_block_size + 4);
314          nir_def *avail_end = nir_load_ssbo(&b, 1, 32, src_buf, avail_end_offset);
315 
316          nir_def *task_invoc_result_available =
317             nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avail_start, avail_end), 0x80000000));
318 
319          nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), task_invoc_result_available), 0x1);
320       }
321       nir_pop_if(&b, NULL);
322    }
323 
324    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
325    nir_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
326    nir_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
327 
328    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
329                            nir_b2i32(&b, nir_load_var(&b, available)));
330 
331    nir_push_if(&b, nir_load_var(&b, available));
332 
333    nir_store_var(&b, output_offset, output_base, 0x1);
334    for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
335       nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i)));
336 
337       nir_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8);
338       nir_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset);
339 
340       nir_def *end_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size);
341       nir_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset);
342 
343       nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
344 
345       /* Store result */
346       nir_push_if(&b, result_is_64bit);
347 
348       nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset));
349 
350       nir_push_else(&b, NULL);
351 
352       nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset));
353 
354       nir_pop_if(&b, NULL);
355 
356       nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size), 0x1);
357 
358       nir_pop_if(&b, NULL);
359    }
360 
361    nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
362 
363    nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
364 
365    /* Stores zeros in all outputs. */
366 
367    nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
368    nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
369 
370    nir_loop *loop = nir_push_loop(&b);
371 
372    nir_def *current_counter = nir_load_var(&b, counter);
373    radv_break_on_count(&b, counter, elem_count);
374 
375    nir_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter));
376    nir_push_if(&b, result_is_64bit);
377 
378    nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem);
379 
380    nir_push_else(&b, NULL);
381 
382    nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem);
383 
384    nir_pop_if(&b, NULL);
385 
386    nir_pop_loop(&b, loop);
387    nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
388    nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
389    return b.shader;
390 }
391 
392 static nir_shader *
393 build_tfb_query_shader(struct radv_device *device)
394 {
395    /* the shader this builds is roughly
396     *
397     * uint32_t src_stride = 32;
398     *
399     * location(binding = 0) buffer dst_buf;
400     * location(binding = 1) buffer src_buf;
401     *
402     * void main() {
403     *	uint64_t result[2] = {};
404     *	bool available = false;
405     *	uint64_t src_offset = src_stride * global_id.x;
406     * 	uint64_t dst_offset = dst_stride * global_id.x;
407     * 	uint64_t *src_data = src_buf[src_offset];
408     *	uint32_t avail = (src_data[0] >> 32) &
409     *			 (src_data[1] >> 32) &
410     *			 (src_data[2] >> 32) &
411     *			 (src_data[3] >> 32);
412     *	if (avail & 0x80000000) {
413     *		result[0] = src_data[3] - src_data[1];
414     *		result[1] = src_data[2] - src_data[0];
415     *		available = true;
416     *	}
417     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
418     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
419     *		if (flags & VK_QUERY_RESULT_64_BIT) {
420     *			dst_buf[dst_offset] = result;
421     *		} else {
422     *			dst_buf[dst_offset] = (uint32_t)result;
423     *		}
424     *	}
425     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
426     *		dst_buf[dst_offset + result_size] = available;
427     * 	}
428     * }
429     */
430    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query");
431    b.shader->info.workgroup_size[0] = 64;
432 
433    /* Create and initialize local variables. */
434    nir_variable *result = nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result");
435    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
436 
437    nir_store_var(&b, result, nir_replicate(&b, nir_imm_int64(&b, 0), 2), 0x3);
438    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
439 
440    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
441 
442    /* Load resources. */
443    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
444    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
445 
446    /* Compute global ID. */
447    nir_def *global_id = get_global_ids(&b, 1);
448 
449    /* Compute src/dst strides. */
450    nir_def *input_stride = nir_imm_int(&b, 32);
451    nir_def *input_base = nir_imul(&b, input_stride, global_id);
452    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
453    nir_def *output_base = nir_imul(&b, output_stride, global_id);
454 
455    /* Load data from the query pool. */
456    nir_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32);
457    nir_def *load2 = nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16);
458 
459    /* Check if result is available. */
460    nir_def *avails[2];
461    avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3));
462    avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3));
463    nir_def *result_is_available = nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000);
464 
465    /* Only compute result if available. */
466    nir_push_if(&b, result_is_available);
467 
468    /* Pack values. */
469    nir_def *packed64[4];
470    packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
471    packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3)));
472    packed64[2] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
473    packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3)));
474 
475    /* Compute result. */
476    nir_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]);
477    nir_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]);
478 
479    nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3);
480    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
481 
482    nir_pop_if(&b, NULL);
483 
484    /* Determine if result is 64 or 32 bit. */
485    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
486    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
487 
488    /* Store the result if complete or partial results have been requested. */
489    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
490 
491    /* Store result. */
492    nir_push_if(&b, result_is_64bit);
493 
494    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
495 
496    nir_push_else(&b, NULL);
497 
498    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
499 
500    nir_pop_if(&b, NULL);
501    nir_pop_if(&b, NULL);
502 
503    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
504                            nir_b2i32(&b, nir_load_var(&b, available)));
505 
506    return b.shader;
507 }
508 
509 static nir_shader *
510 build_timestamp_query_shader(struct radv_device *device)
511 {
512    /* the shader this builds is roughly
513     *
514     * uint32_t src_stride = 8;
515     *
516     * location(binding = 0) buffer dst_buf;
517     * location(binding = 1) buffer src_buf;
518     *
519     * void main() {
520     *	uint64_t result = 0;
521     *	bool available = false;
522     *	uint64_t src_offset = src_stride * global_id.x;
523     * 	uint64_t dst_offset = dst_stride * global_id.x;
524     * 	uint64_t timestamp = src_buf[src_offset];
525     *	if (timestamp != TIMESTAMP_NOT_READY) {
526     *		result = timestamp;
527     *		available = true;
528     *	}
529     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
530     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
531     *		if (flags & VK_QUERY_RESULT_64_BIT) {
532     *			dst_buf[dst_offset] = result;
533     *		} else {
534     *			dst_buf[dst_offset] = (uint32_t)result;
535     *		}
536     *	}
537     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
538     *		dst_buf[dst_offset + result_size] = available;
539     * 	}
540     * }
541     */
542    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query");
543    b.shader->info.workgroup_size[0] = 64;
544 
545    /* Create and initialize local variables. */
546    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
547    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
548 
549    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
550    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
551 
552    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
553 
554    /* Load resources. */
555    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
556    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
557 
558    /* Compute global ID. */
559    nir_def *global_id = get_global_ids(&b, 1);
560 
561    /* Compute src/dst strides. */
562    nir_def *input_stride = nir_imm_int(&b, 8);
563    nir_def *input_base = nir_imul(&b, input_stride, global_id);
564    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
565    nir_def *output_base = nir_imul(&b, output_stride, global_id);
566 
567    /* Load data from the query pool. */
568    nir_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8);
569 
570    /* Pack the timestamp. */
571    nir_def *timestamp;
572    timestamp = nir_pack_64_2x32(&b, nir_trim_vector(&b, load, 2));
573 
574    /* Check if result is available. */
575    nir_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY));
576 
577    /* Only store result if available. */
578    nir_push_if(&b, result_is_available);
579 
580    nir_store_var(&b, result, timestamp, 0x1);
581    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
582 
583    nir_pop_if(&b, NULL);
584 
585    /* Determine if result is 64 or 32 bit. */
586    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
587    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
588 
589    /* Store the result if complete or partial results have been requested. */
590    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
591 
592    /* Store result. */
593    nir_push_if(&b, result_is_64bit);
594 
595    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
596 
597    nir_push_else(&b, NULL);
598 
599    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
600 
601    nir_pop_if(&b, NULL);
602 
603    nir_pop_if(&b, NULL);
604 
605    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
606                            nir_b2i32(&b, nir_load_var(&b, available)));
607 
608    return b.shader;
609 }
610 
611 #define RADV_PGQ_STRIDE     32
612 #define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 8 * 2)
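/* Slot layout consumed by build_pg_query_shader() below (illustrative): the
 * 64-bit begin value sits at byte 0 and the end value at byte 16, each with
 * its availability bit in the high dword. With GDS emulation, the extra
 * 2 * 8 bytes hold the 32-bit NGG GDS begin/end counters at offsets 32 and 40,
 * with their availability dwords at offsets 36 and 44.
 */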
613 
614 static nir_shader *
615 build_pg_query_shader(struct radv_device *device)
616 {
617    /* the shader this builds is roughly
618     *
619     * uint32_t src_stride = 32;
620     *
621     * location(binding = 0) buffer dst_buf;
622     * location(binding = 1) buffer src_buf;
623     *
624     * void main() {
625     *	uint64_t result = {};
626     *	bool available = false;
627     *	uint64_t src_offset = src_stride * global_id.x;
628     * 	uint64_t dst_offset = dst_stride * global_id.x;
629     * 	uint64_t *src_data = src_buf[src_offset];
630     *	uint32_t avail = (src_data[0] >> 32) &
631     *			 (src_data[2] >> 32);
632     *	if (avail & 0x80000000) {
633     *		result = src_data[2] - src_data[0];
634     *	        if (use_gds) {
635     *			uint32_t ngg_gds_result = 0;
636     *			ngg_gds_result += src_data[9] - src_data[8];
637     *			result += (uint64_t)ngg_gds_result;
638     *	        }
639     *		available = true;
640     *	}
641     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
642     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
643     *		if (flags & VK_QUERY_RESULT_64_BIT) {
644     *			dst_buf[dst_offset] = result;
645     *		} else {
646     *			dst_buf[dst_offset] = (uint32_t)result;
647     *		}
648     *	}
649     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
650     *		dst_buf[dst_offset + result_size] = available;
651     * 	}
652     * }
653     */
654    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query");
655    b.shader->info.workgroup_size[0] = 64;
656 
657    /* Create and initialize local variables. */
658    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
659    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
660 
661    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
662    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
663 
664    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
665 
666    /* Load resources. */
667    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
668    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
669 
670    /* Compute global ID. */
671    nir_def *global_id = get_global_ids(&b, 1);
672 
673    /* Determine if the query pool uses GDS for NGG. */
674    nir_def *uses_gds = nir_i2b(&b, nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20));
675 
676    /* Compute src/dst strides. */
677    nir_def *input_stride =
678       nir_bcsel(&b, uses_gds, nir_imm_int(&b, RADV_PGQ_STRIDE_GDS), nir_imm_int(&b, RADV_PGQ_STRIDE));
679    nir_def *input_base = nir_imul(&b, input_stride, global_id);
680    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
681    nir_def *output_base = nir_imul(&b, output_stride, global_id);
682 
683    /* Load data from the query pool. */
684    nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
685    nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16);
686 
687    /* Check if result is available. */
688    nir_def *avails[2];
689    avails[0] = nir_channel(&b, load1, 1);
690    avails[1] = nir_channel(&b, load2, 1);
691    nir_store_var(&b, available, nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000)), 0x1);
692 
693    nir_push_if(&b, uses_gds);
694    {
695       nir_def *gds_avail_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 36), .align_mul = 4);
696       nir_def *gds_avail_end = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 44), .align_mul = 4);
697       nir_def *gds_result_available =
698          nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, gds_avail_start, gds_avail_end), 0x80000000));
699 
700       nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), gds_result_available), 0x1);
701    }
702    nir_pop_if(&b, NULL);
703 
704    /* Only compute result if available. */
705    nir_push_if(&b, nir_load_var(&b, available));
706 
707    /* Pack values. */
708    nir_def *packed64[2];
709    packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
710    packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
711 
712    /* Compute result. */
713    nir_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]);
714 
715    nir_store_var(&b, result, primitive_storage_needed, 0x1);
716 
717    nir_push_if(&b, uses_gds);
718    {
719       nir_def *gds_start =
720          nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4);
721       nir_def *gds_end =
722          nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 4);
723 
724       nir_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
725 
726       nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), nir_u2u64(&b, ngg_gds_result)), 0x1);
727    }
728    nir_pop_if(&b, NULL);
729 
730    nir_pop_if(&b, NULL);
731 
732    /* Determine if result is 64 or 32 bit. */
733    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
734    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
735 
736    /* Store the result if complete or partial results have been requested. */
737    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
738 
739    /* Store result. */
740    nir_push_if(&b, result_is_64bit);
741 
742    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
743 
744    nir_push_else(&b, NULL);
745 
746    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
747 
748    nir_pop_if(&b, NULL);
749    nir_pop_if(&b, NULL);
750 
751    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
752                            nir_b2i32(&b, nir_load_var(&b, available)));
753 
754    return b.shader;
755 }
756 
757 static nir_shader *
758 build_ms_prim_gen_query_shader(struct radv_device *device)
759 {
760    /* the shader this builds is roughly
761     *
762     * uint32_t src_stride = 32;
763     *
764     * location(binding = 0) buffer dst_buf;
765     * location(binding = 1) buffer src_buf;
766     *
767     * void main() {
768     *	uint64_t result = {};
769     *	bool available = false;
770     *	uint64_t src_offset = src_stride * global_id.x;
771     * 	uint64_t dst_offset = dst_stride * global_id.x;
772     * 	uint64_t *src_data = src_buf[src_offset];
773     *	uint32_t avail = (src_data[0] >> 32) & (src_data[1] >> 32);
774     *	if (avail & 0x80000000) {
775     *		result = src_data[1] - src_data[0];
776     *		available = true;
777     *	}
778     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
779     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
780     *		if (flags & VK_QUERY_RESULT_64_BIT) {
781     *			dst_buf[dst_offset] = result;
782     *		} else {
783     *			dst_buf[dst_offset] = (uint32_t)result;
784     *		}
785     *	}
786     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
787     *		dst_buf[dst_offset + result_size] = available;
788     * 	}
789     * }
790     */
791    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "ms_prim_gen_query");
792    b.shader->info.workgroup_size[0] = 64;
793 
794    /* Create and initialize local variables. */
795    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
796    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
797 
798    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
799    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
800 
801    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
802 
803    /* Load resources. */
804    nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
805    nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
806 
807    /* Compute global ID. */
808    nir_def *global_id = get_global_ids(&b, 1);
809 
810    /* Compute src/dst strides. */
811    nir_def *input_base = nir_imul_imm(&b, global_id, 16);
812    nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
813    nir_def *output_base = nir_imul(&b, output_stride, global_id);
814 
815    /* Load data from the query pool. */
816    nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
817    nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd_imm(&b, input_base, 8), .align_mul = 16);
818 
819    /* Check if result is available. */
820    nir_def *avails[2];
821    avails[0] = nir_channel(&b, load1, 1);
822    avails[1] = nir_channel(&b, load2, 1);
823    nir_def *result_is_available = nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000));
824 
825    /* Only compute result if available. */
826    nir_push_if(&b, result_is_available);
827 
828    /* Pack values. */
829    nir_def *packed64[2];
830    packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
831    packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
832 
833    /* Compute result. */
834    nir_def *ms_prim_gen = nir_isub(&b, packed64[1], packed64[0]);
835 
836    nir_store_var(&b, result, ms_prim_gen, 0x1);
837 
838    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
839 
840    nir_pop_if(&b, NULL);
841 
842    /* Determine if result is 64 or 32 bit. */
843    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
844    nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
845 
846    /* Store the result if complete or partial results have been requested. */
847    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
848 
849    /* Store result. */
850    nir_push_if(&b, result_is_64bit);
851 
852    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
853 
854    nir_push_else(&b, NULL);
855 
856    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
857 
858    nir_pop_if(&b, NULL);
859    nir_pop_if(&b, NULL);
860 
861    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
862                            nir_b2i32(&b, nir_load_var(&b, available)));
863 
864    return b.shader;
865 }
866 
867 static VkResult
868 radv_device_init_meta_query_state_internal(struct radv_device *device)
869 {
870    VkResult result;
871    nir_shader *occlusion_cs = NULL;
872    nir_shader *pipeline_statistics_cs = NULL;
873    nir_shader *tfb_cs = NULL;
874    nir_shader *timestamp_cs = NULL;
875    nir_shader *pg_cs = NULL;
876    nir_shader *ms_prim_gen_cs = NULL;
877 
878    mtx_lock(&device->meta_state.mtx);
879    if (device->meta_state.query.pipeline_statistics_query_pipeline) {
880       mtx_unlock(&device->meta_state.mtx);
881       return VK_SUCCESS;
882    }
883    occlusion_cs = build_occlusion_query_shader(device);
884    pipeline_statistics_cs = build_pipeline_statistics_query_shader(device);
885    tfb_cs = build_tfb_query_shader(device);
886    timestamp_cs = build_timestamp_query_shader(device);
887    pg_cs = build_pg_query_shader(device);
888 
889    if (device->physical_device->emulate_mesh_shader_queries)
890       ms_prim_gen_cs = build_ms_prim_gen_query_shader(device);
891 
892    VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
893       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
894       .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
895       .bindingCount = 2,
896       .pBindings = (VkDescriptorSetLayoutBinding[]){
897          {.binding = 0,
898           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
899           .descriptorCount = 1,
900           .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
901           .pImmutableSamplers = NULL},
902          {.binding = 1,
903           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
904           .descriptorCount = 1,
905           .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
906           .pImmutableSamplers = NULL},
907       }};
908 
909    result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &occlusion_ds_create_info,
910                                            &device->meta_state.alloc, &device->meta_state.query.ds_layout);
911    if (result != VK_SUCCESS)
912       goto fail;
913 
914    VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
915       .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
916       .setLayoutCount = 1,
917       .pSetLayouts = &device->meta_state.query.ds_layout,
918       .pushConstantRangeCount = 1,
919       .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 20},
920    };
921 
922    result = radv_CreatePipelineLayout(radv_device_to_handle(device), &occlusion_pl_create_info,
923                                       &device->meta_state.alloc, &device->meta_state.query.p_layout);
924    if (result != VK_SUCCESS)
925       goto fail;
926 
927    VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
928       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
929       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
930       .module = vk_shader_module_handle_from_nir(occlusion_cs),
931       .pName = "main",
932       .pSpecializationInfo = NULL,
933    };
934 
935    VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
936       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
937       .stage = occlusion_pipeline_shader_stage,
938       .flags = 0,
939       .layout = device->meta_state.query.p_layout,
940    };
941 
942    result =
943       radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &occlusion_vk_pipeline_info,
944                                    NULL, &device->meta_state.query.occlusion_query_pipeline);
945    if (result != VK_SUCCESS)
946       goto fail;
947 
948    VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
949       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
950       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
951       .module = vk_shader_module_handle_from_nir(pipeline_statistics_cs),
952       .pName = "main",
953       .pSpecializationInfo = NULL,
954    };
955 
956    VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
957       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
958       .stage = pipeline_statistics_pipeline_shader_stage,
959       .flags = 0,
960       .layout = device->meta_state.query.p_layout,
961    };
962 
963    result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache,
964                                          &pipeline_statistics_vk_pipeline_info, NULL,
965                                          &device->meta_state.query.pipeline_statistics_query_pipeline);
966    if (result != VK_SUCCESS)
967       goto fail;
968 
969    VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
970       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
971       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
972       .module = vk_shader_module_handle_from_nir(tfb_cs),
973       .pName = "main",
974       .pSpecializationInfo = NULL,
975    };
976 
977    VkComputePipelineCreateInfo tfb_pipeline_info = {
978       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
979       .stage = tfb_pipeline_shader_stage,
980       .flags = 0,
981       .layout = device->meta_state.query.p_layout,
982    };
983 
984    result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &tfb_pipeline_info,
985                                          NULL, &device->meta_state.query.tfb_query_pipeline);
986    if (result != VK_SUCCESS)
987       goto fail;
988 
989    VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
990       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
991       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
992       .module = vk_shader_module_handle_from_nir(timestamp_cs),
993       .pName = "main",
994       .pSpecializationInfo = NULL,
995    };
996 
997    VkComputePipelineCreateInfo timestamp_pipeline_info = {
998       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
999       .stage = timestamp_pipeline_shader_stage,
1000       .flags = 0,
1001       .layout = device->meta_state.query.p_layout,
1002    };
1003 
1004    result =
1005       radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &timestamp_pipeline_info,
1006                                    NULL, &device->meta_state.query.timestamp_query_pipeline);
1007    if (result != VK_SUCCESS)
1008       goto fail;
1009 
1010    VkPipelineShaderStageCreateInfo pg_pipeline_shader_stage = {
1011       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1012       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1013       .module = vk_shader_module_handle_from_nir(pg_cs),
1014       .pName = "main",
1015       .pSpecializationInfo = NULL,
1016    };
1017 
1018    VkComputePipelineCreateInfo pg_pipeline_info = {
1019       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1020       .stage = pg_pipeline_shader_stage,
1021       .flags = 0,
1022       .layout = device->meta_state.query.p_layout,
1023    };
1024 
1025    result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &pg_pipeline_info,
1026                                          NULL, &device->meta_state.query.pg_query_pipeline);
1027 
1028    if (device->physical_device->emulate_mesh_shader_queries) {
1029       VkPipelineShaderStageCreateInfo ms_prim_gen_pipeline_shader_stage = {
1030          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1031          .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1032          .module = vk_shader_module_handle_from_nir(ms_prim_gen_cs),
1033          .pName = "main",
1034          .pSpecializationInfo = NULL,
1035       };
1036 
1037       VkComputePipelineCreateInfo ms_prim_gen_pipeline_info = {
1038          .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1039          .stage = ms_prim_gen_pipeline_shader_stage,
1040          .flags = 0,
1041          .layout = device->meta_state.query.p_layout,
1042       };
1043 
1044       result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache,
1045                                             &ms_prim_gen_pipeline_info, NULL,
1046                                             &device->meta_state.query.ms_prim_gen_query_pipeline);
1047    }
1048 
1049 fail:
1050    ralloc_free(occlusion_cs);
1051    ralloc_free(pipeline_statistics_cs);
1052    ralloc_free(tfb_cs);
1053    ralloc_free(pg_cs);
1054    ralloc_free(ms_prim_gen_cs);
1055    ralloc_free(timestamp_cs);
1056    mtx_unlock(&device->meta_state.mtx);
1057    return result;
1058 }
1059 
1060 VkResult
1061 radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
1062 {
1063    if (on_demand)
1064       return VK_SUCCESS;
1065 
1066    return radv_device_init_meta_query_state_internal(device);
1067 }
1068 
1069 void
1070 radv_device_finish_meta_query_state(struct radv_device *device)
1071 {
1072    if (device->meta_state.query.tfb_query_pipeline)
1073       radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.tfb_query_pipeline,
1074                            &device->meta_state.alloc);
1075 
1076    if (device->meta_state.query.pipeline_statistics_query_pipeline)
1077       radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.pipeline_statistics_query_pipeline,
1078                            &device->meta_state.alloc);
1079 
1080    if (device->meta_state.query.occlusion_query_pipeline)
1081       radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.occlusion_query_pipeline,
1082                            &device->meta_state.alloc);
1083 
1084    if (device->meta_state.query.timestamp_query_pipeline)
1085       radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.timestamp_query_pipeline,
1086                            &device->meta_state.alloc);
1087 
1088    if (device->meta_state.query.pg_query_pipeline)
1089       radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.pg_query_pipeline,
1090                            &device->meta_state.alloc);
1091 
1092    if (device->meta_state.query.ms_prim_gen_query_pipeline)
1093       radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.ms_prim_gen_query_pipeline,
1094                            &device->meta_state.alloc);
1095 
1096    if (device->meta_state.query.p_layout)
1097       radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout,
1098                                  &device->meta_state.alloc);
1099 
1100    if (device->meta_state.query.ds_layout)
1101       device->vk.dispatch_table.DestroyDescriptorSetLayout(
1102          radv_device_to_handle(device), device->meta_state.query.ds_layout, &device->meta_state.alloc);
1103 }
1104 
1105 static void
1106 radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, struct radeon_winsys_bo *src_bo,
1107                   struct radeon_winsys_bo *dst_bo, uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride,
1108                   uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags, uint32_t pipeline_stats_mask,
1109                   uint32_t avail_offset, bool uses_gds)
1110 {
1111    struct radv_device *device = cmd_buffer->device;
1112    struct radv_meta_saved_state saved_state;
1113    struct radv_buffer src_buffer, dst_buffer;
1114 
1115    if (!*pipeline) {
1116       VkResult ret = radv_device_init_meta_query_state_internal(device);
1117       if (ret != VK_SUCCESS) {
1118          vk_command_buffer_set_error(&cmd_buffer->vk, ret);
1119          return;
1120       }
1121    }
1122 
1123    /* VK_EXT_conditional_rendering says that copy commands should not be
1124     * affected by conditional rendering.
1125     */
1126    radv_meta_save(&saved_state, cmd_buffer,
1127                   RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS |
1128                      RADV_META_SUSPEND_PREDICATING);
1129 
1130    uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset);
1131    uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size;
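   /* Only the last element needs dst_size bytes past its base offset; e.g. with
    * count == 3, dst_stride == 32 and dst_size == 8 (values chosen purely for
    * illustration) this is 2 * 32 + 8 = 72 bytes.
    */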
1132 
1133    radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset);
1134    radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset);
1135 
1136    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1137 
1138    radv_meta_push_descriptor_set(
1139       cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.query.p_layout, 0, /* set */
1140       2,                                                                                /* descriptorWriteCount */
1141       (VkWriteDescriptorSet[]){{.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1142                                 .dstBinding = 0,
1143                                 .dstArrayElement = 0,
1144                                 .descriptorCount = 1,
1145                                 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1146                                 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer),
1147                                                                          .offset = 0,
1148                                                                          .range = VK_WHOLE_SIZE}},
1149                                {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1150                                 .dstBinding = 1,
1151                                 .dstArrayElement = 0,
1152                                 .descriptorCount = 1,
1153                                 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1154                                 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer),
1155                                                                          .offset = 0,
1156                                                                          .range = VK_WHOLE_SIZE}}});
1157 
1158    /* Encode the number of elements for easy access by the shader. */
1159    pipeline_stats_mask &= (1 << (radv_get_pipelinestat_query_size(device) / 8)) - 1;
1160    pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
1161 
1162    avail_offset -= src_offset;
1163 
1164    struct {
1165       uint32_t flags;
1166       uint32_t dst_stride;
1167       uint32_t pipeline_stats_mask;
1168       uint32_t avail_offset;
1169       uint32_t uses_gds;
1170    } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_gds};
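   /* This layout mirrors the nir_load_push_constant() offsets used by the copy
    * shaders above (0: flags, 4: dst_stride, 8: stats_mask, 12: avail_offset,
    * 16: uses_gds) and the 20-byte push-constant range declared in
    * radv_device_init_meta_query_state_internal().
    */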
1171 
1172    vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.query.p_layout,
1173                               VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
1174 
1175    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1176 
1177    if (flags & VK_QUERY_RESULT_WAIT_BIT)
1178       cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1179 
1180    radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1181 
1182    /* Ensure that the query copy dispatch is complete before a potential vkCmdResetPool because
1183     * there is an implicit execution dependency from each such query command to all query commands
1184     * previously submitted to the same queue.
1185     */
1186    cmd_buffer->active_query_flush_bits |=
1187       RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1188 
1189    radv_buffer_finish(&src_buffer);
1190    radv_buffer_finish(&dst_buffer);
1191 
1192    radv_meta_restore(&saved_state, cmd_buffer);
1193 }
1194 
1195 static void
1196 radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
1197                         struct radv_query_pool *pool)
1198 {
1199    if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
1200       radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
1201 
1202    if (pool->bo) {
1203       radv_rmv_log_bo_destroy(device, pool->bo);
1204       device->ws->buffer_destroy(device->ws, pool->bo);
1205    }
1206 
1207    radv_rmv_log_resource_destroy(device, (uint64_t)radv_query_pool_to_handle(pool));
1208    vk_query_pool_finish(&pool->vk);
1209    vk_free2(&device->vk.alloc, pAllocator, pool);
1210 }
1211 
1212 VkResult
1213 radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *pCreateInfo,
1214                        const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool, bool is_internal)
1215 {
1216    VkResult result;
1217    size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
1218                                 ? sizeof(struct radv_pc_query_pool)
1219                                 : sizeof(struct radv_query_pool);
1220 
1221    struct radv_query_pool *pool =
1222       vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1223 
1224    if (!pool)
1225       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1226 
1227    vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
1228 
1229    /* The number of primitives generated by geometry shader invocations is only counted by the
1230     * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number
1231     * of generated primitives and we have to increment it from the shader using a plain GDS atomic.
1232     *
1233     * The number of geometry shader invocations is correctly counted by the hardware for both NGG
1234     * and the legacy GS path, but it also increments for NGG VS/TES because they are merged with GS. To
1235     * keep this counter from incrementing in that case, it is emulated as well.
1236     */
1237    pool->uses_gds =
1238       (device->physical_device->emulate_ngg_gs_query_pipeline_stat &&
1239        (pool->vk.pipeline_statistics & (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
1240                                         VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT))) ||
1241       (device->physical_device->use_ngg && pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) ||
1242       (device->physical_device->emulate_mesh_shader_queries &&
1243        (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT ||
1244         pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT));
1245 
1246    /* The number of task shader invocations needs to be queried on ACE. */
1247    pool->uses_ace = (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1248 
1249    switch (pCreateInfo->queryType) {
1250    case VK_QUERY_TYPE_OCCLUSION:
1251       pool->stride = 16 * device->physical_device->rad_info.max_render_backends;
1252       break;
1253    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1254       pool->stride = radv_get_pipelinestat_query_size(device) * 2;
1255       break;
1256    case VK_QUERY_TYPE_TIMESTAMP:
1257    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1258    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1259    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1260    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1261       pool->stride = 8;
1262       break;
1263    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
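      /* Begin/end samples of {NumPrimitivesWritten, PrimitiveStorageNeeded}: 4 x 64-bit values. */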
1264       pool->stride = 32;
1265       break;
1266    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1267       if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) {
1268          /* When the hardware can use both the legacy and the NGG paths in the same begin/end pair,
1269           * allocate 2x64-bit values for the GDS counters.
1270           */
1271          pool->stride = RADV_PGQ_STRIDE_GDS;
1272       } else {
1273          pool->stride = RADV_PGQ_STRIDE;
1274       }
1275       break;
1276    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1277       result = radv_pc_init_query_pool(device->physical_device, pCreateInfo, (struct radv_pc_query_pool *)pool);
1278 
1279       if (result != VK_SUCCESS) {
1280          radv_destroy_query_pool(device, pAllocator, pool);
1281          return vk_error(device, result);
1282       }
1283       break;
1284    }
1285    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1286       if (device->physical_device->rad_info.gfx_level >= GFX11) {
1287          /* GFX11 natively counts mesh generated primitives via pipeline statistics. */
1288          pool->stride = radv_get_pipelinestat_query_size(device) * 2;
1289       } else {
1290          assert(device->physical_device->emulate_mesh_shader_queries);
1291          pool->stride = 16;
1292       }
1293       break;
1294    default:
1295       unreachable("creating unhandled query type");
1296    }
1297 
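   /* Lay the pool out as one fixed-stride result slot per query, followed by a 32-bit availability
    * word per query for the query types that need one.
    */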
1298    pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1299    pool->size = pool->availability_offset;
1300    if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1301        (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT &&
1302         device->physical_device->rad_info.gfx_level >= GFX11))
1303       pool->size += 4 * pCreateInfo->queryCount;
1304 
1305    result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT,
1306                                       RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo);
1307    if (result != VK_SUCCESS) {
1308       radv_destroy_query_pool(device, pAllocator, pool);
1309       return vk_error(device, result);
1310    }
1311 
1312    pool->ptr = device->ws->buffer_map(pool->bo);
1313    if (!pool->ptr) {
1314       radv_destroy_query_pool(device, pAllocator, pool);
1315       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1316    }
1317 
1318    *pQueryPool = radv_query_pool_to_handle(pool);
1319    radv_rmv_log_query_pool_create(device, *pQueryPool, is_internal);
1320    return VK_SUCCESS;
1321 }
1322 
1323 VKAPI_ATTR VkResult VKAPI_CALL
1324 radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
1325                      const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1326 {
1327    RADV_FROM_HANDLE(radv_device, device, _device);
1328    return radv_create_query_pool(device, pCreateInfo, pAllocator, pQueryPool, false);
1329 }
1330 
1331 VKAPI_ATTR void VKAPI_CALL
1332 radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator)
1333 {
1334    RADV_FROM_HANDLE(radv_device, device, _device);
1335    RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
1336 
1337    if (!pool)
1338       return;
1339 
1340    radv_destroy_query_pool(device, pAllocator, pool);
1341 }
1342 
1343 static inline uint64_t
1344 radv_get_rel_timeout_for_query(VkQueryType type)
1345 {
1346    /*
1347     * Certain queries are only possible on certain types of queues,
1348     * so pick the TDR timeout of the highest possible queue type
1349     * and double it to ensure GetQueryPoolResults completes in finite time.
1350     *
1351     * (compute has a longer TDR than gfx and the other rings)
1352     */
1353    switch (type) {
1354    case VK_QUERY_TYPE_OCCLUSION:
1355    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1356    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1357    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1358       return radv_get_tdr_timeout_for_ip(AMD_IP_GFX) * 2;
1359    default:
1360       return radv_get_tdr_timeout_for_ip(AMD_IP_COMPUTE) * 2;
1361    }
1362 }
1363 
1364 VKAPI_ATTR VkResult VKAPI_CALL
1365 radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount,
1366                          size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)
1367 {
1368    RADV_FROM_HANDLE(radv_device, device, _device);
1369    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1370    char *data = pData;
1371    VkResult result = VK_SUCCESS;
1372 
1373    if (vk_device_is_lost(&device->vk))
1374       return VK_ERROR_DEVICE_LOST;
1375 
1376    for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) {
1377       char *dest = data;
1378       unsigned query = firstQuery + query_idx;
1379       char *src = pool->ptr + query * pool->stride;
1380       uint32_t available;
1381       bool timed_out = false;
1382       uint64_t atimeout = os_time_get_absolute_timeout(radv_get_rel_timeout_for_query(pool->vk.query_type));
1383 
1384       switch (pool->vk.query_type) {
1385       case VK_QUERY_TYPE_TIMESTAMP:
1386       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1387       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1388       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1389       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: {
1390          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1391          uint64_t value;
1392 
1393          do {
1394             value = p_atomic_read(&src64->value);
1395          } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1396                   !(timed_out = (atimeout < os_time_get_nano())));
1397 
1398          available = value != TIMESTAMP_NOT_READY;
1399 
1400          if (timed_out)
1401             result = VK_ERROR_DEVICE_LOST;
1402          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1403             result = VK_NOT_READY;
1404 
1405          if (flags & VK_QUERY_RESULT_64_BIT) {
1406             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1407                *(uint64_t *)dest = value;
1408             dest += 8;
1409          } else {
1410             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1411                *(uint32_t *)dest = (uint32_t)value;
1412             dest += 4;
1413          }
1414          break;
1415       }
1416       case VK_QUERY_TYPE_OCCLUSION: {
1417          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1418          uint32_t db_count = device->physical_device->rad_info.max_render_backends;
1419          uint64_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1420          uint64_t sample_count = 0;
1421          available = 1;
1422 
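         /* Each enabled render backend stores a begin/end pair of ZPASS counters; the hardware sets
          * bit 63 of a counter once its value is valid.
          */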
1423          for (int i = 0; i < db_count; ++i) {
1424             uint64_t start, end;
1425 
1426             if (!(enabled_rb_mask & (1ull << i)))
1427                continue;
1428 
1429             do {
1430                start = p_atomic_read(&src64[2 * i].value);
1431                end = p_atomic_read(&src64[2 * i + 1].value);
1432             } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1433                      !(timed_out = (atimeout < os_time_get_nano())));
1434 
1435             if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1436                available = 0;
1437             else {
1438                sample_count += end - start;
1439             }
1440          }
1441 
1442          if (timed_out)
1443             result = VK_ERROR_DEVICE_LOST;
1444          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1445             result = VK_NOT_READY;
1446 
1447          if (flags & VK_QUERY_RESULT_64_BIT) {
1448             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1449                *(uint64_t *)dest = sample_count;
1450             dest += 8;
1451          } else {
1452             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1453                *(uint32_t *)dest = sample_count;
1454             dest += 4;
1455          }
1456          break;
1457       }
1458       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1459          unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1460          const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1461 
1462          do {
1463             available = p_atomic_read(avail_ptr);
1464 
1465             if (pool->uses_ace && device->physical_device->emulate_mesh_shader_queries) {
1466                const uint32_t task_invoc_offset =
1467                   radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1468                const uint32_t *avail_ptr_start = (const uint32_t *)(src + task_invoc_offset + 4);
1469                const uint32_t *avail_ptr_stop =
1470                   (const uint32_t *)(src + pipelinestat_block_size + task_invoc_offset + 4);
1471 
1472                if (!(p_atomic_read(avail_ptr_start) & 0x80000000) || !(p_atomic_read(avail_ptr_stop) & 0x80000000))
1473                   available = 0;
1474             }
1475          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1476 
1477          if (timed_out)
1478             result = VK_ERROR_DEVICE_LOST;
1479          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1480             result = VK_NOT_READY;
1481 
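         /* The query slot holds two full blocks of counters (begin and end samples); report the
          * per-statistic deltas in the bit order requested by the application.
          */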
1482          const uint64_t *start = (uint64_t *)src;
1483          const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1484          if (flags & VK_QUERY_RESULT_64_BIT) {
1485             uint64_t *dst = (uint64_t *)dest;
1486             dest += util_bitcount(pool->vk.pipeline_statistics) * 8;
1487             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1488                if (pool->vk.pipeline_statistics & (1u << i)) {
1489                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1490                      *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
1491                   }
1492                   dst++;
1493                }
1494             }
1495 
1496          } else {
1497             uint32_t *dst = (uint32_t *)dest;
1498             dest += util_bitcount(pool->vk.pipeline_statistics) * 4;
1499             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1500                if (pool->vk.pipeline_statistics & (1u << i)) {
1501                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1502                      *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
1503                   }
1504                   dst++;
1505                }
1506             }
1507          }
1508          break;
1509       }
1510       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1511          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1512          uint64_t num_primitives_written;
1513          uint64_t primitive_storage_needed;
1514 
1515          /* SAMPLE_STREAMOUTSTATS stores this structure:
1516           * {
1517           *	u64 NumPrimitivesWritten;
1518           *	u64 PrimitiveStorageNeeded;
1519           * }
1520           */
1521          do {
1522             available = 1;
1523             for (int j = 0; j < 4; j++) {
1524                if (!(p_atomic_read(&src64[j].value) & 0x8000000000000000UL))
1525                   available = 0;
1526             }
1527          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1528 
1529          if (timed_out)
1530             result = VK_ERROR_DEVICE_LOST;
1531          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1532             result = VK_NOT_READY;
1533 
1534          num_primitives_written = p_atomic_read_relaxed(&src64[3].value) - p_atomic_read_relaxed(&src64[1].value);
1535          primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
1536 
1537          if (flags & VK_QUERY_RESULT_64_BIT) {
1538             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1539                *(uint64_t *)dest = num_primitives_written;
1540             dest += 8;
1541             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1542                *(uint64_t *)dest = primitive_storage_needed;
1543             dest += 8;
1544          } else {
1545             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1546                *(uint32_t *)dest = num_primitives_written;
1547             dest += 4;
1548             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1549                *(uint32_t *)dest = primitive_storage_needed;
1550             dest += 4;
1551          }
1552          break;
1553       }
1554       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1555          const bool uses_gds_query = pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11;
1556          p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1557          uint64_t primitive_storage_needed;
1558 
1559          /* SAMPLE_STREAMOUTSTATS stores this structure:
1560           * {
1561           *	u64 NumPrimitivesWritten;
1562           *	u64 PrimitiveStorageNeeded;
1563           * }
1564           */
1565          do {
1566             available = 1;
1567             if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
1568                 !(p_atomic_read(&src64[2].value) & 0x8000000000000000UL)) {
1569                available = 0;
1570             }
1571             if (uses_gds_query && (!(p_atomic_read(&src64[4].value) & 0x8000000000000000UL) ||
1572                                    !(p_atomic_read(&src64[5].value) & 0x8000000000000000UL))) {
1573                available = 0;
1574             }
1575          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1576 
1577          if (timed_out)
1578             result = VK_ERROR_DEVICE_LOST;
1579          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1580             result = VK_NOT_READY;
1581 
1582          primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
1583 
1584          if (uses_gds_query) {
1585             /* Accumulate the result that was copied from GDS in case an NGG shader was used. */
1586             primitive_storage_needed += p_atomic_read_relaxed(&src64[5].value) - p_atomic_read_relaxed(&src64[4].value);
1587          }
1588 
1589          if (flags & VK_QUERY_RESULT_64_BIT) {
1590             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1591                *(uint64_t *)dest = primitive_storage_needed;
1592             dest += 8;
1593          } else {
1594             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1595                *(uint32_t *)dest = primitive_storage_needed;
1596             dest += 4;
1597          }
1598          break;
1599       }
1600       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1601          struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
1602          const p_atomic_uint64_t *src64 = (const p_atomic_uint64_t *)src;
1603          bool avail;
1604          do {
1605             avail = true;
1606             for (unsigned i = 0; i < pc_pool->num_passes; ++i)
1607                if (!p_atomic_read(&src64[pool->stride / 8 - i - 1].value))
1608                   avail = false;
1609          } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1610 
1611          available = avail;
1612 
1613          radv_pc_get_results(pc_pool, &src64->value, dest);
1614          dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
1615          break;
1616       }
1617       case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
1618          uint64_t ms_prim_gen;
1619 
1620          if (device->physical_device->rad_info.gfx_level >= GFX11) {
1621             unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1622             const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1623 
1624             do {
1625                available = p_atomic_read(avail_ptr);
1626             } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1627                      !(timed_out = (atimeout < os_time_get_nano())));
1628 
1629             const uint64_t *start = (uint64_t *)src;
1630             const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1631 
1632             ms_prim_gen = stop[pipeline_statistics_indices[13]] - start[pipeline_statistics_indices[13]];
1633          } else {
1634             p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1635 
1636             do {
1637                available = 1;
1638                if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
1639                    !(p_atomic_read(&src64[1].value) & 0x8000000000000000UL)) {
1640                   available = 0;
1641                }
1642             } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1643                      !(timed_out = (atimeout < os_time_get_nano())));
1644 
1645             ms_prim_gen = p_atomic_read_relaxed(&src64[1].value) - p_atomic_read_relaxed(&src64[0].value);
1646          }
1647 
1648          if (timed_out)
1649             result = VK_ERROR_DEVICE_LOST;
1650          else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1651             result = VK_NOT_READY;
1652 
1653          if (flags & VK_QUERY_RESULT_64_BIT) {
1654             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1655                *(uint64_t *)dest = ms_prim_gen;
1656             dest += 8;
1657          } else {
1658             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1659                *(uint32_t *)dest = ms_prim_gen;
1660             dest += 4;
1661          }
1662          break;
1663       }
1664       default:
1665          unreachable("trying to get results of unhandled query type");
1666       }
1667 
1668       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1669          if (flags & VK_QUERY_RESULT_64_BIT) {
1670             *(uint64_t *)dest = available;
1671          } else {
1672             *(uint32_t *)dest = available;
1673          }
1674       }
1675    }
1676 
1677    if (result == VK_ERROR_DEVICE_LOST)
1678       vk_device_set_lost(&device->vk, "GetQueryPoolResults timed out");
1679 
1680    return result;
1681 }
1682 
1683 static void
1684 emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool)
1685 {
1686    if (cmd_buffer->pending_reset_query) {
1687       if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1688          /* Only need to flush caches if the query pool size is
1689           * large enough to be reset using the compute shader
1690           * path. Small pools don't need any cache flushes
1691           * because we use a CP dma clear.
1692           */
1693          radv_emit_cache_flush(cmd_buffer);
1694       }
1695    }
1696 }
1697 
1698 static size_t
1699 radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags)
1700 {
1701    unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0;
1702    switch (pool->vk.query_type) {
1703    case VK_QUERY_TYPE_TIMESTAMP:
1704    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1705    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1706    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1707    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1708    case VK_QUERY_TYPE_OCCLUSION:
1709    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1710       values += 1;
1711       break;
1712    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1713       values += util_bitcount(pool->vk.pipeline_statistics);
1714       break;
1715    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1716       values += 2;
1717       break;
1718    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1719       values += 1;
1720       break;
1721    default:
1722       unreachable("trying to get size of unhandled query type");
1723    }
1724    return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4);
1725 }
1726 
1727 VKAPI_ATTR void VKAPI_CALL
1728 radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
1729                              uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride,
1730                              VkQueryResultFlags flags)
1731 {
1732    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1733    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1734    RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
1735    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1736    uint64_t va = radv_buffer_get_va(pool->bo);
1737    uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
1738    size_t dst_size = radv_query_result_size(pool, flags);
1739    dest_va += dst_buffer->offset + dstOffset;
1740 
1741    if (!queryCount)
1742       return;
1743 
1744    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
1745    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
1746 
1747    /* Work around engines that forget to properly specify WAIT_BIT because some drivers implicitly
1748     * synchronize before the query copy.
1749     */
1750    if (cmd_buffer->device->instance->drirc.flush_before_query_copy)
1751       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1752 
1753    /* From the Vulkan spec 1.1.108:
1754     *
1755     * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1756     *  previous uses of vkCmdResetQueryPool in the same queue, without any
1757     *  additional synchronization."
1758     *
1759     * So, we have to flush the caches if the compute shader path was used.
1760     */
1761    emit_query_flush(cmd_buffer, pool);
1762 
1763    switch (pool->vk.query_type) {
1764    case VK_QUERY_TYPE_OCCLUSION:
1765       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline, pool->bo,
1766                         dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride,
1767                         dst_size, queryCount, flags, 0, 0, false);
1768       break;
1769    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1770       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1771          const uint32_t task_invoc_offset =
1772             radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1773          const unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
1774 
1775          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1776             unsigned query = firstQuery + i;
1777 
1778             radeon_check_space(cmd_buffer->device->ws, cs, 7);
1779 
1780             uint64_t avail_va = va + pool->availability_offset + 4 * query;
1781 
1782             /* This waits on the ME. All copies below are done on the ME */
1783             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1784 
1785             if (pool->uses_ace && cmd_buffer->device->physical_device->emulate_mesh_shader_queries) {
1786                const uint64_t src_va = va + query * pool->stride;
1787                const uint64_t start_va = src_va + task_invoc_offset + 4;
1788                const uint64_t stop_va = start_va + pipelinestat_block_size;
1789 
1790                radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
1791 
1792                radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, start_va, 0x80000000, 0xffffffff);
1793                radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, stop_va, 0x80000000, 0xffffffff);
1794             }
1795          }
1796       }
1797       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, pool->bo,
1798                         dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride,
1799                         dst_size, queryCount, flags, pool->vk.pipeline_statistics,
1800                         pool->availability_offset + 4 * firstQuery, false);
1801       break;
1802    case VK_QUERY_TYPE_TIMESTAMP:
1803    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1804    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1805    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1806    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1807       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1808          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1809             unsigned query = firstQuery + i;
1810             uint64_t local_src_va = va + query * pool->stride;
1811 
1812             radeon_check_space(cmd_buffer->device->ws, cs, 7);
1813 
1814             /* Wait on the high 32 bits of the timestamp in
1815              * case the low part is 0xffffffff.
1816              */
1817             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4, TIMESTAMP_NOT_READY >> 32,
1818                              0xffffffff);
1819          }
1820       }
1821 
1822       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline, pool->bo,
1823                         dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride,
1824                         dst_size, queryCount, flags, 0, 0, false);
1825       break;
1826    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1827       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1828          for (unsigned i = 0; i < queryCount; i++) {
1829             unsigned query = firstQuery + i;
1830             uint64_t src_va = va + query * pool->stride;
1831 
1832             radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1833 
1834             /* Wait on the upper word of all results. */
1835             for (unsigned j = 0; j < 4; j++, src_va += 8) {
1836                radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1837             }
1838          }
1839       }
1840 
1841       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline, pool->bo, dst_buffer->bo,
1842                         firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1843                         queryCount, flags, 0, 0, false);
1844       break;
1845    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1846       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1847          const bool uses_gds_query = pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11;
1848 
1849          for (unsigned i = 0; i < queryCount; i++) {
1850             unsigned query = firstQuery + i;
1851             uint64_t src_va = va + query * pool->stride;
1852 
1853             radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1854 
1855             /* Wait on the upper word of the PrimitiveStorageNeeded result. */
1856             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1857             radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
1858 
1859             if (uses_gds_query) {
1860                radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 36, 0x80000000, 0xffffffff);
1861                radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 44, 0x80000000, 0xffffffff);
1862             }
1863          }
1864       }
1865 
1866       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pg_query_pipeline, pool->bo, dst_buffer->bo,
1867                         firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1868                         queryCount, flags, 0, 0,
1869                         pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11);
1870       break;
1871    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1872       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1873          if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1874             for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1875                unsigned query = firstQuery + i;
1876 
1877                radeon_check_space(cmd_buffer->device->ws, cs, 7);
1878 
1879                uint64_t avail_va = va + pool->availability_offset + 4 * query;
1880 
1881                /* This waits on the ME. All copies below are done on the ME */
1882                radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1883             }
1884          }
1885          radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
1886                            pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset,
1887                            pool->stride, stride, dst_size, queryCount, flags, 1 << 13,
1888                            pool->availability_offset + 4 * firstQuery, false);
1889       } else {
1890          if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1891             for (unsigned i = 0; i < queryCount; i++) {
1892                unsigned query = firstQuery + i;
1893                uint64_t src_va = va + query * pool->stride;
1894 
1895                radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
1896 
1897                /* Wait on the upper word. */
1898                radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1899                radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 12, 0x80000000, 0xffffffff);
1900             }
1901          }
1902 
1903          radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.ms_prim_gen_query_pipeline, pool->bo,
1904                            dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride,
1905                            stride, dst_size, queryCount, flags, 0, 0, false);
1906       }
1907       break;
1908    default:
1909       unreachable("trying to get results of unhandled query type");
1910    }
1911 }
1912 
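/* Timestamp-like queries are reset to TIMESTAMP_NOT_READY (all ones) so that readers can tell an
 * unwritten slot apart from a real value; all other query types are simply zeroed.
 */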
1913 static uint32_t
1914 query_clear_value(VkQueryType type)
1915 {
1916    switch (type) {
1917    case VK_QUERY_TYPE_TIMESTAMP:
1918    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1919    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1920    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1921    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1922       return (uint32_t)TIMESTAMP_NOT_READY;
1923    default:
1924       return 0;
1925    }
1926 }
1927 
1928 VKAPI_ATTR void VKAPI_CALL
1929 radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
1930 {
1931    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1932    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1933    uint32_t value = query_clear_value(pool->vk.query_type);
1934    uint32_t flush_bits = 0;
1935 
1936    /* Make sure to sync all previous work if the given command buffer has
1937     * pending active queries. Otherwise the GPU might write query data
1938     * after the reset operation.
1939     */
1940    cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1941 
1942    flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
1943                                   queryCount * pool->stride, value);
1944 
1945    if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1946        (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT &&
1947         cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11)) {
1948       flush_bits |=
1949          radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1950                           radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4, queryCount * 4, 0);
1951    }
1952 
1953    if (flush_bits) {
1954       /* Only need to flush caches for the compute shader path. */
1955       cmd_buffer->pending_reset_query = true;
1956       cmd_buffer->state.flush_bits |= flush_bits;
1957    }
1958 }
1959 
1960 VKAPI_ATTR void VKAPI_CALL
1961 radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
1962 {
1963    RADV_FROM_HANDLE(radv_device, device, _device);
1964    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1965 
1966    uint32_t value = query_clear_value(pool->vk.query_type);
1967    uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
1968    uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1969 
1970    for (uint32_t *p = data; p != data_end; ++p)
1971       *p = value;
1972 
1973    if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1974        (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT &&
1975         device->physical_device->rad_info.gfx_level >= GFX11)) {
1976       memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
1977    }
1978 }
1979 
1980 static unsigned
1981 event_type_for_stream(unsigned stream)
1982 {
1983    switch (stream) {
1984    default:
1985    case 0:
1986       return V_028A90_SAMPLE_STREAMOUTSTATS;
1987    case 1:
1988       return V_028A90_SAMPLE_STREAMOUTSTATS1;
1989    case 2:
1990       return V_028A90_SAMPLE_STREAMOUTSTATS2;
1991    case 3:
1992       return V_028A90_SAMPLE_STREAMOUTSTATS3;
1993    }
1994 }
1995 
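/* Ask the hardware to dump the streamout counters of the given stream at the given VA by emitting
 * the corresponding SAMPLE_STREAMOUTSTATS* event.
 */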
1996 static void
1997 emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1998 {
1999    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2000 
2001    radeon_check_space(cmd_buffer->device->ws, cs, 4);
2002 
2003    assert(index < MAX_SO_STREAMS);
2004 
2005    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2006    radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
2007    radeon_emit(cs, va);
2008    radeon_emit(cs, va >> 32);
2009 }
2010 
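/* Copy a 32-bit shader query counter from GDS into the query pool BO with a COPY_DATA packet. */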
2011 static void
2012 gfx10_copy_gds_query(struct radeon_cmdbuf *cs, uint32_t gds_offset, uint64_t va)
2013 {
2014    radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2015    radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
2016    radeon_emit(cs, gds_offset);
2017    radeon_emit(cs, 0);
2018    radeon_emit(cs, va);
2019    radeon_emit(cs, va >> 32);
2020 }
2021 
2022 static void
2023 gfx10_copy_gds_query_gfx(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
2024 {
2025    /* Make sure GDS is idle before copying the value. */
2026    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
2027    radv_emit_cache_flush(cmd_buffer);
2028 
2029    gfx10_copy_gds_query(cmd_buffer->cs, gds_offset, va);
2030 }
2031 
2032 static void
2033 gfx10_copy_gds_query_ace(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
2034 {
2035    /* Make sure GDS is idle before copying the value. */
2036    cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
2037    radv_gang_cache_flush(cmd_buffer);
2038 
2039    gfx10_copy_gds_query(cmd_buffer->gang.cs, gds_offset, va);
2040 }
2041 
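/* Toggle the HW pipeline statistics counters: start them when the first pipeline-stat-style query
 * becomes active and stop them again once the last one ends.
 */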
2042 static void
2043 radv_update_hw_pipelinestat(struct radv_cmd_buffer *cmd_buffer)
2044 {
2045    const uint32_t num_pipeline_stat_queries = radv_get_num_pipeline_stat_queries(cmd_buffer);
2046 
2047    if (num_pipeline_stat_queries == 0) {
2048       cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
2049       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
2050    } else if (num_pipeline_stat_queries == 1) {
2051       cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
2052       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
2053    }
2054 }
2055 
2056 static void
2057 emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, VkQueryType query_type,
2058                  VkQueryControlFlags flags, uint32_t index)
2059 {
2060    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2061    switch (query_type) {
2062    case VK_QUERY_TYPE_OCCLUSION:
2063       radeon_check_space(cmd_buffer->device->ws, cs, 11);
2064 
2065       ++cmd_buffer->state.active_occlusion_queries;
2066       if (cmd_buffer->state.active_occlusion_queries == 1) {
2067          if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
2068             /* This is the first occlusion query; enable
2069              * the hint if the precision bit is set.
2070              */
2071             cmd_buffer->state.perfect_occlusion_queries_enabled = true;
2072          }
2073 
2074          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2075       } else {
2076          if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) && !cmd_buffer->state.perfect_occlusion_queries_enabled) {
2077             /* This is not the first query, but this one
2078              * needs precise results, so DB_COUNT_CONTROL
2079              * has to be updated accordingly.
2080              */
2081             cmd_buffer->state.perfect_occlusion_queries_enabled = true;
2082 
2083             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2084          }
2085       }
2086 
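      /* On GFX11 the ZPASS_DONE event is replaced by PIXEL_PIPE_STAT_DUMP, and with recent enough
       * PFP firmware a dedicated EVENT_WRITE_ZPASS packet is used instead.
       */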
2087       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 &&
2088           cmd_buffer->device->physical_device->rad_info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
2089          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
2090       } else {
2091          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2092          if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2093             radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
2094          } else {
2095             radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
2096          }
2097       }
2098       radeon_emit(cs, va);
2099       radeon_emit(cs, va >> 32);
2100       break;
2101    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
2102       radeon_check_space(cmd_buffer->device->ws, cs, 4);
2103 
2104       ++cmd_buffer->state.active_pipeline_queries;
2105 
2106       radv_update_hw_pipelinestat(cmd_buffer);
2107 
2108       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
2109          uint32_t cs_invoc_offset =
2110             radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
2111          va += cs_invoc_offset;
2112       }
2113 
2114       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2115       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2116       radeon_emit(cs, va);
2117       radeon_emit(cs, va >> 32);
2118 
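      /* Snapshot the emulated (GDS-based) counters into the same slots the hardware would have
       * written so that the result path can compute deltas uniformly.
       */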
2119       if (pool->uses_gds) {
2120          if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
2121             uint32_t gs_prim_offset =
2122                radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
2123 
2124             gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, va + gs_prim_offset);
2125          }
2126 
2127          if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) {
2128             uint32_t gs_invoc_offset =
2129                radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT);
2130 
2131             gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_INVOCATION_OFFSET, va + gs_invoc_offset);
2132          }
2133 
2134          if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
2135             uint32_t mesh_invoc_offset =
2136                radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
2137 
2138             gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
2139          }
2140 
2141          /* Record that the command buffer needs GDS. */
2142          cmd_buffer->gds_needed = true;
2143 
2144          if (!cmd_buffer->state.active_pipeline_gds_queries)
2145             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2146 
2147          cmd_buffer->state.active_pipeline_gds_queries++;
2148       }
2149 
2150       if (pool->uses_ace) {
2151          uint32_t task_invoc_offset =
2152             radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
2153 
2154          if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2155             va += task_invoc_offset;
2156 
2157             radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 4);
2158 
2159             radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2160             radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2161             radeon_emit(cmd_buffer->gang.cs, va);
2162             radeon_emit(cmd_buffer->gang.cs, va >> 32);
2163          } else {
2164             radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 11);
2165 
2166             gfx10_copy_gds_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
2167             radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
2168 
2169             /* Record that the command buffer needs GDS. */
2170             cmd_buffer->gds_needed = true;
2171 
2172             if (!cmd_buffer->state.active_pipeline_ace_queries)
2173                cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2174 
2175             cmd_buffer->state.active_pipeline_ace_queries++;
2176          }
2177       }
2178       break;
2179    }
2180    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2181       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
2182          /* generated prim counter */
2183          gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
2184          radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2185 
2186          /* written prim counter */
2187          gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 8);
2188          radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
2189 
2190          /* Record that the command buffer needs GDS. */
2191          cmd_buffer->gds_needed = true;
2192 
2193          if (!cmd_buffer->state.active_prims_xfb_gds_queries)
2194             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2195 
2196          cmd_buffer->state.active_prims_xfb_gds_queries++;
2197       } else {
2198          cmd_buffer->state.active_prims_xfb_queries++;
2199 
2200          radv_update_hw_pipelinestat(cmd_buffer);
2201 
2202          emit_sample_streamout(cmd_buffer, va, index);
2203       }
2204       break;
2205    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
2206       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2207          /* On GFX11+, the primitives generated query always uses GDS. */
2208          gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
2209          radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2210 
2211          /* Record that the command buffer needs GDS. */
2212          cmd_buffer->gds_needed = true;
2213 
2214          if (!cmd_buffer->state.active_prims_gen_gds_queries)
2215             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2216 
2217          cmd_buffer->state.active_prims_gen_gds_queries++;
2218       } else {
2219          if (!cmd_buffer->state.active_prims_gen_queries) {
2220             bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
2221 
2222             cmd_buffer->state.active_prims_gen_queries++;
2223 
2224             if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
2225                cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
2226             }
2227          } else {
2228             cmd_buffer->state.active_prims_gen_queries++;
2229          }
2230 
2231          radv_update_hw_pipelinestat(cmd_buffer);
2232 
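         /* With the emulated NGG path, the generated-primitives counter is stored after the two
          * 16-byte streamout samples: begin at offset 32, end at offset 40.
          */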
2233          if (pool->uses_gds) {
2234             /* generated prim counter */
2235             gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 32);
2236             radv_cs_write_data_imm(cs, V_370_ME, va + 36, 0x80000000);
2237 
2238             /* Record that the command buffer needs GDS. */
2239             cmd_buffer->gds_needed = true;
2240 
2241             if (!cmd_buffer->state.active_prims_gen_gds_queries)
2242                cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2243 
2244             cmd_buffer->state.active_prims_gen_gds_queries++;
2245          }
2246 
2247          emit_sample_streamout(cmd_buffer, va, index);
2248       }
2249       break;
2250    }
2251    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2252       radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
2253       break;
2254    }
2255    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
2256       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2257          radeon_check_space(cmd_buffer->device->ws, cs, 4);
2258 
2259          ++cmd_buffer->state.active_pipeline_queries;
2260 
2261          radv_update_hw_pipelinestat(cmd_buffer);
2262 
2263          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2264          radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2265          radeon_emit(cs, va);
2266          radeon_emit(cs, va >> 32);
2267       } else {
2268          gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va);
2269          radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2270 
2271          /* Record that the command buffer needs GDS. */
2272          cmd_buffer->gds_needed = true;
2273 
2274          if (!cmd_buffer->state.active_prims_gen_gds_queries)
2275             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2276 
2277          cmd_buffer->state.active_prims_gen_gds_queries++;
2278       }
2279       break;
2280    }
2281    default:
2282       unreachable("beginning unhandled query type");
2283    }
2284 }
2285 
2286 static void
2287 emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint64_t avail_va,
2288                VkQueryType query_type, uint32_t index)
2289 {
2290    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2291    switch (query_type) {
2292    case VK_QUERY_TYPE_OCCLUSION:
2293       radeon_check_space(cmd_buffer->device->ws, cs, 14);
2294 
2295       cmd_buffer->state.active_occlusion_queries--;
2296       if (cmd_buffer->state.active_occlusion_queries == 0) {
2297          /* Reset the perfect occlusion queries hint now that no
2298           * queries are active.
2299           */
2300          cmd_buffer->state.perfect_occlusion_queries_enabled = false;
2301 
2302          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2303       }
2304 
2305       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 &&
2306           cmd_buffer->device->physical_device->rad_info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
2307          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
2308       } else {
2309          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2310          if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2311             radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
2312          } else {
2313             radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
2314          }
2315       }
2316       radeon_emit(cs, va + 8);
2317       radeon_emit(cs, (va + 8) >> 32);
2318 
2319       break;
2320    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
2321       unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
2322 
2323       radeon_check_space(cmd_buffer->device->ws, cs, 16);
2324 
2325       cmd_buffer->state.active_pipeline_queries--;
2326 
2327       radv_update_hw_pipelinestat(cmd_buffer);
2328 
2329       va += pipelinestat_block_size;
2330 
2331       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
2332          uint32_t cs_invoc_offset =
2333             radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
2334          va += cs_invoc_offset;
2335       }
2336 
2337       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2338       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2339       radeon_emit(cs, va);
2340       radeon_emit(cs, va >> 32);
2341 
2342       if (pool->uses_gds) {
2343          if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
2344             uint32_t gs_prim_offset =
2345                radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
2346 
2347             gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, va + gs_prim_offset);
2348          }
2349 
2350          if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) {
2351             uint32_t gs_invoc_offset =
2352                radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT);
2353 
2354             gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_INVOCATION_OFFSET, va + gs_invoc_offset);
2355          }
2356 
2357          if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
2358             uint32_t mesh_invoc_offset =
2359                radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
2360 
2361             gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
2362          }
2363 
2364          cmd_buffer->state.active_pipeline_gds_queries--;
2365 
2366          if (!cmd_buffer->state.active_pipeline_gds_queries)
2367             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2368       }
2369 
2370       if (pool->uses_ace) {
2371          uint32_t task_invoc_offset =
2372             radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
2373 
2374          if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2375             va += task_invoc_offset;
2376 
2377             radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 4);
2378 
2379             radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2380             radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2381             radeon_emit(cmd_buffer->gang.cs, va);
2382             radeon_emit(cmd_buffer->gang.cs, va >> 32);
2383          } else {
2384             radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 11);
2385 
2386             gfx10_copy_gds_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
2387             radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
2388 
2389             cmd_buffer->state.active_pipeline_ace_queries--;
2390 
2391             if (!cmd_buffer->state.active_pipeline_ace_queries)
2392                cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2393          }
2394       }
2395 
2396       radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
2397                                    V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va,
2398                                    1, cmd_buffer->gfx9_eop_bug_va);
2399       break;
2400    }
2401    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2402       if (cmd_buffer->device->physical_device->use_ngg_streamout) {
2403          /* generated prim counter */
2404          gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
2405          radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
2406 
2407          /* written prim counter */
2408          gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 24);
2409          radv_cs_write_data_imm(cs, V_370_ME, va + 28, 0x80000000);
2410 
2411          cmd_buffer->state.active_prims_xfb_gds_queries--;
2412 
2413          if (!cmd_buffer->state.active_prims_xfb_gds_queries)
2414             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2415       } else {
2416          cmd_buffer->state.active_prims_xfb_queries--;
2417 
2418          radv_update_hw_pipelinestat(cmd_buffer);
2419 
2420          emit_sample_streamout(cmd_buffer, va + 16, index);
2421       }
2422       break;
2423    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
2424       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2425          /* On GFX11+, the primitives generated query always uses GDS. */
2426          gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
2427          radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
2428 
2429          cmd_buffer->state.active_prims_gen_gds_queries--;
2430 
2431          if (!cmd_buffer->state.active_prims_gen_gds_queries)
2432             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2433       } else {
2434          if (cmd_buffer->state.active_prims_gen_queries == 1) {
2435             bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
2436 
2437             cmd_buffer->state.active_prims_gen_queries--;
2438 
2439             if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
2440                cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
2441             }
2442          } else {
2443             cmd_buffer->state.active_prims_gen_queries--;
2444          }
2445 
2446          radv_update_hw_pipelinestat(cmd_buffer);
2447 
2448          if (pool->uses_gds) {
2449             /* generated prim counter */
2450             gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 40);
2451             radv_cs_write_data_imm(cs, V_370_ME, va + 44, 0x80000000);
2452 
2453             cmd_buffer->state.active_prims_gen_gds_queries--;
2454 
2455             if (!cmd_buffer->state.active_prims_gen_gds_queries)
2456                cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2457          }
2458 
2459          emit_sample_streamout(cmd_buffer, va + 16, index);
2460       }
2461       break;
2462    }
2463    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2464       radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
2465       break;
2466    }
2467    case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
2468       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2469          unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
2470 
2471          radeon_check_space(cmd_buffer->device->ws, cs, 16);
2472 
2473          cmd_buffer->state.active_pipeline_queries--;
2474 
2475          radv_update_hw_pipelinestat(cmd_buffer);
2476 
2477          va += pipelinestat_block_size;
2478 
2479          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2480          radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2481          radeon_emit(cs, va);
2482          radeon_emit(cs, va >> 32);
2483 
2484          radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
2485                                       V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT,
2486                                       avail_va, 1, cmd_buffer->gfx9_eop_bug_va);
2487       } else {
2488          gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va + 8);
2489          radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
2490 
2491          cmd_buffer->state.active_prims_gen_gds_queries--;
2492 
2493          if (!cmd_buffer->state.active_prims_gen_gds_queries)
2494             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2495       }
2496       break;
2497    }
2498    default:
2499       unreachable("ending unhandled query type");
2500    }
2501 
2502    cmd_buffer->active_query_flush_bits |=
2503       RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
2504    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
2505       cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2506    }
2507 }
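
/*
 * emit_end_query() finishes by recording flush bits so that later result
 * copies and pool resets observe coherent memory. A minimal application-side
 * sketch (hypothetical handles, standard Vulkan entry points) of the sequence
 * that reaches emit_begin_query()/emit_end_query() for an occlusion query:
 *
 *    VkQueryPoolCreateInfo info = {
 *       .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
 *       .queryType = VK_QUERY_TYPE_OCCLUSION,
 *       .queryCount = 1,
 *    };
 *    VkQueryPool pool;
 *    vkCreateQueryPool(device, &info, NULL, &pool);
 *
 *    vkCmdResetQueryPool(cmd, pool, 0, 1);
 *    vkCmdBeginQuery(cmd, pool, 0, 0);
 *    // ... draws ...
 *    vkCmdEndQuery(cmd, pool, 0);
 *
 *    uint64_t samples;
 *    vkGetQueryPoolResults(device, pool, 0, 1, sizeof(samples), &samples, sizeof(samples),
 *                          VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
 */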
2508 
2509 VKAPI_ATTR void VKAPI_CALL
2510 radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
2511                              VkQueryControlFlags flags, uint32_t index)
2512 {
2513    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2514    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2515    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2516    uint64_t va = radv_buffer_get_va(pool->bo);
2517 
2518    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2519 
2520    emit_query_flush(cmd_buffer, pool);
2521 
2522    va += pool->stride * query;
2523 
2524    if (pool->uses_ace) {
2525       if (!radv_gang_init(cmd_buffer))
2526          return;
2527 
2528       radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->gang.cs, pool->bo);
2529    }
2530 
2531    emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, flags, index);
2532 }
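
/*
 * When the pool carries task-shader invocation statistics (pool->uses_ace),
 * the gang (ACE) command stream has to be brought up before the query starts,
 * because those counters are sampled from the compute queue's own command
 * stream in emit_begin_query()/emit_end_query().
 *
 * Indexed queries are mainly used with VK_EXT_transform_feedback, where the
 * index selects the vertex stream; a sketch of the application-side calls
 * (hypothetical handles):
 *
 *    vkCmdBeginQueryIndexedEXT(cmd, xfb_pool, 0, 0, stream);
 *    // ... emit geometry with transform feedback active ...
 *    vkCmdEndQueryIndexedEXT(cmd, xfb_pool, 0, stream);
 */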
2533 
2534 VKAPI_ATTR void VKAPI_CALL
2535 radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, uint32_t index)
2536 {
2537    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2538    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2539    uint64_t va = radv_buffer_get_va(pool->bo);
2540    uint64_t avail_va = va + pool->availability_offset + 4 * query;
2541    va += pool->stride * query;
2542 
2543    /* There is no need to add the pool BO to the list because the query must
2544     * currently be active, which means the BO is already in the list.
2545     */
2546    emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, index);
2547 
2548    /*
2549     * For multiview we have to emit a query for each bit set in the view
2550     * mask. However, the first query we emit gets the totals for all the
2551     * views, so we don't want real values in the other queries. Emit a
2552     * fake begin/end sequence for each remaining slot so the waiting code
2553     * sees a completed query value and doesn't hang, while those extra
2554     * queries simply return 0.
2555     */
2556    if (cmd_buffer->state.render.view_mask) {
2557       for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.render.view_mask); i++) {
2558          va += pool->stride;
2559          avail_va += 4;
2560          emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, 0, 0);
2561          emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, 0);
2562       }
2563    }
2564 }
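
/*
 * Under multiview a query is expected to use one slot per bit set in the view
 * mask; the hardware totals land in the first slot while the padded slots are
 * made available and read back as 0. An application therefore fetches
 * util_bitcount(view_mask) results, e.g. (sketch, hypothetical handles):
 *
 *    uint64_t results[8]; // illustrative upper bound on views
 *    vkGetQueryPoolResults(device, pool, first_query, num_views,
 *                          num_views * sizeof(uint64_t), results, sizeof(uint64_t),
 *                          VK_QUERY_RESULT_64_BIT);
 */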
2565 
2566 void
2567 radv_write_timestamp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkPipelineStageFlags2 stage)
2568 {
2569    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2570 
2571    if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
2572       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2573       radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM | COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
2574                          COPY_DATA_DST_SEL(V_370_MEM));
2575       radeon_emit(cs, 0);
2576       radeon_emit(cs, 0);
2577       radeon_emit(cs, va);
2578       radeon_emit(cs, va >> 32);
2579    } else {
2580       radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
2581                                    V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_TIMESTAMP, va, 0,
2582                                    cmd_buffer->gfx9_eop_bug_va);
2583    }
2584 }
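
/*
 * TOP_OF_PIPE timestamps copy the current GPU clock immediately via
 * COPY_DATA; every other stage is implemented conservatively as a
 * bottom-of-pipe timestamp through an EOP event. Raw ticks are converted to
 * nanoseconds by the application with VkPhysicalDeviceLimits::timestampPeriod,
 * e.g. (sketch):
 *
 *    double elapsed_ns = (double)(end_ticks - begin_ticks) * limits.timestampPeriod;
 */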
2585 
2586 VKAPI_ATTR void VKAPI_CALL
2587 radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkQueryPool queryPool,
2588                         uint32_t query)
2589 {
2590    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2591    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2592    const unsigned num_queries = MAX2(util_bitcount(cmd_buffer->state.render.view_mask), 1);
2593    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2594    const uint64_t va = radv_buffer_get_va(pool->bo);
2595    uint64_t query_va = va + pool->stride * query;
2596 
2597    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2598 
2599    if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
2600       if (cmd_buffer->device->instance->drirc.flush_before_timestamp_write) {
2601          radeon_check_space(cmd_buffer->device->ws, cs, 1);
2602          radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
2603       }
2604 
2605       for (unsigned i = 0; i < num_queries; ++i, query_va += pool->stride) {
2606          radeon_check_space(cmd_buffer->device->ws, cs, 3);
2607          radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
2608          radeon_emit(cs, query_va);
2609          radeon_emit(cs, query_va >> 32);
2610       }
2611       return;
2612    }
2613 
2614    if (cmd_buffer->device->instance->drirc.flush_before_timestamp_write) {
2615       /* Make sure previously launched waves have finished */
2616       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
2617    }
2618 
2619    radv_emit_cache_flush(cmd_buffer);
2620 
2621    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
2622 
2623    for (unsigned i = 0; i < num_queries; i++) {
2624       radv_write_timestamp(cmd_buffer, query_va, stage);
2625       query_va += pool->stride;
2626    }
2627 
2628    cmd_buffer->active_query_flush_bits |=
2629       RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
2630    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
2631       cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2632    }
2633 
2634    assert(cmd_buffer->cs->cdw <= cdw_max);
2635 }
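
/*
 * On the transfer (SDMA) queue the timestamp is written with a
 * GET_GLOBAL_TIMESTAMP packet; graphics/compute queues go through
 * radv_write_timestamp() above, with one write per active view. Typical usage
 * (sketch, hypothetical handles):
 *
 *    vkCmdWriteTimestamp2(cmd, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ts_pool, 0);
 *    // ... work to be measured ...
 *    vkCmdWriteTimestamp2(cmd, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, ts_pool, 1);
 *
 *    uint64_t ts[2];
 *    vkGetQueryPoolResults(device, ts_pool, 0, 2, sizeof(ts), ts, sizeof(ts[0]),
 *                          VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
 */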
2636 
2637 VKAPI_ATTR void VKAPI_CALL
2638 radv_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
2639                                                  const VkAccelerationStructureKHR *pAccelerationStructures,
2640                                                  VkQueryType queryType, VkQueryPool queryPool, uint32_t firstQuery)
2641 {
2642    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2643    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2644    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2645    uint64_t pool_va = radv_buffer_get_va(pool->bo);
2646    uint64_t query_va = pool_va + pool->stride * firstQuery;
2647 
2648    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2649 
2650    radv_emit_cache_flush(cmd_buffer);
2651 
2652    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 6 * accelerationStructureCount);
2653 
2654    for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
2655       RADV_FROM_HANDLE(vk_acceleration_structure, accel_struct, pAccelerationStructures[i]);
2656       uint64_t va = vk_acceleration_structure_get_va(accel_struct);
2657 
2658       switch (queryType) {
2659       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2660          va += offsetof(struct radv_accel_struct_header, compacted_size);
2661          break;
2662       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2663          va += offsetof(struct radv_accel_struct_header, serialization_size);
2664          break;
2665       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2666          va += offsetof(struct radv_accel_struct_header, instance_count);
2667          break;
2668       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2669          va += offsetof(struct radv_accel_struct_header, size);
2670          break;
2671       default:
2672          unreachable("Unhandled accel struct query type.");
2673       }
2674 
2675       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2676       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
2677                          COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
2678       radeon_emit(cs, va);
2679       radeon_emit(cs, va >> 32);
2680       radeon_emit(cs, query_va);
2681       radeon_emit(cs, query_va >> 32);
2682 
2683       query_va += pool->stride;
2684    }
2685 
2686    assert(cmd_buffer->cs->cdw <= cdw_max);
2687 }
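
/*
 * Each requested property is copied straight out of the radv_accel_struct_header
 * with a CP COPY_DATA packet, so the acceleration structure build must already
 * be complete and visible (the application is responsible for that barrier).
 * A typical compaction flow on the application side (sketch, hypothetical
 * handles):
 *
 *    vkCmdWriteAccelerationStructuresPropertiesKHR(
 *       cmd, 1, &accel, VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR, as_pool, 0);
 *
 *    VkDeviceSize compacted_size;
 *    vkGetQueryPoolResults(device, as_pool, 0, 1, sizeof(compacted_size), &compacted_size,
 *                          sizeof(compacted_size),
 *                          VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
 *
 *    // ... create a smaller acceleration structure "compacted", then ...
 *    const VkCopyAccelerationStructureInfoKHR copy_info = {
 *       .sType = VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,
 *       .src = accel,
 *       .dst = compacted,
 *       .mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR,
 *    };
 *    vkCmdCopyAccelerationStructureKHR(cmd, &copy_info);
 */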
2688