1 /*
* Copyright 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26 #include <assert.h>
27 #include <fcntl.h>
28 #include <stdbool.h>
29 #include <string.h>
30
31 #include "bvh/bvh.h"
32 #include "meta/radv_meta.h"
33 #include "nir/nir_builder.h"
34 #include "util/u_atomic.h"
35 #include "vulkan/vulkan_core.h"
36 #include "radv_cs.h"
37 #include "radv_private.h"
38 #include "sid.h"
39 #include "vk_acceleration_structure.h"
40 #include "vk_common_entrypoints.h"
41
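/* Sentinel value left in a timestamp slot whose result the GPU has not written yet; both the copy
 * shader and radv_GetQueryPoolResults() treat a slot still holding this value as unavailable.
 */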
42 #define TIMESTAMP_NOT_READY UINT64_MAX
43
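/* Maps the bit index of a VkQueryPipelineStatisticFlagBits flag (as computed with ffs() below) to
 * the 64-bit result slot used in the pipeline-statistics sample buffer; e.g. bit 0
 * (INPUT_ASSEMBLY_VERTICES) is stored in slot 7 and bit 7 (FRAGMENT_SHADER_INVOCATIONS) in slot 0.
 */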
44 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10, 13, 11, 12};
45
46 static unsigned
radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query)
48 {
49 uint32_t idx = ffs(query) - 1;
50 return pipeline_statistics_indices[idx] * 8;
51 }
52
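/* Size in bytes of one pipeline-statistics sample block: with 14 counters this is 14 * 8 = 112
 * bytes, with 11 counters it is 88 bytes. Each query stores two such blocks (begin and end).
 */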
53 static unsigned
radv_get_pipelinestat_query_size(struct radv_device *device)
55 {
56 /* GFX10_3 only has 11 valid pipeline statistics queries but in order to emulate mesh/task shader
57 * invocations, it's easier to use the same size as GFX11.
58 */
59 unsigned num_results = device->physical_device->rad_info.gfx_level >= GFX10_3 ? 14 : 11;
60 return num_results * 8;
61 }
62
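/* Emits roughly the following, writing the 32-bit availability value (zero-extended to 64 bits if
 * VK_QUERY_RESULT_64_BIT is set):
 *
 *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
 *		if (flags & VK_QUERY_RESULT_64_BIT)
 *			dst_buf[offset] = (uint64_t)value32;
 *		else
 *			dst_buf[offset] = value32;
 *	}
 */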
63 static void
radv_store_availability(nir_builder *b, nir_def *flags, nir_def *dst_buf, nir_def *offset, nir_def *value32)
65 {
66 nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
67
68 nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
69
70 nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);
71
72 nir_push_else(b, NULL);
73
74 nir_store_ssbo(b, value32, dst_buf, offset);
75
76 nir_pop_if(b, NULL);
77
78 nir_pop_if(b, NULL);
79 }
80
81 static nir_shader *
build_occlusion_query_shader(struct radv_device *device)
83 {
84 /* the shader this builds is roughly
85 *
86 * push constants {
87 * uint32_t flags;
88 * uint32_t dst_stride;
89 * };
90 *
91 * uint32_t src_stride = 16 * db_count;
92 *
93 * location(binding = 0) buffer dst_buf;
94 * location(binding = 1) buffer src_buf;
95 *
96 * void main() {
97 * uint64_t result = 0;
98 * uint64_t src_offset = src_stride * global_id.x;
99 * uint64_t dst_offset = dst_stride * global_id.x;
100 * bool available = true;
101 * for (int i = 0; i < db_count; ++i) {
102 * if (enabled_rb_mask & BITFIELD64_BIT(i)) {
103 * uint64_t start = src_buf[src_offset + 16 * i];
104 * uint64_t end = src_buf[src_offset + 16 * i + 8];
105 * if ((start & (1ull << 63)) && (end & (1ull << 63)))
106 * result += end - start;
107 * else
108 * available = false;
109 * }
110 * }
111 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
112 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
113 * if (flags & VK_QUERY_RESULT_64_BIT)
114 * dst_buf[dst_offset] = result;
115 * else
* dst_buf[dst_offset] = (uint32_t)result;
117 * }
118 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
119 * dst_buf[dst_offset + elem_size] = available;
120 * }
121 * }
122 */
123 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
124 b.shader->info.workgroup_size[0] = 64;
125
126 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
127 nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
128 nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
129 nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
130 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
131 uint64_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
132 unsigned db_count = device->physical_device->rad_info.max_render_backends;
133
134 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
135
136 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
137 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
138
139 nir_def *global_id = get_global_ids(&b, 1);
140
141 nir_def *input_stride = nir_imm_int(&b, db_count * 16);
142 nir_def *input_base = nir_imul(&b, input_stride, global_id);
143 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
144 nir_def *output_base = nir_imul(&b, output_stride, global_id);
145
146 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
147 nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
148 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
149
150 nir_def *query_result_wait = nir_test_mask(&b, flags, VK_QUERY_RESULT_WAIT_BIT);
151 nir_push_if(&b, query_result_wait);
152 {
153 /* Wait on the upper word of the last DB entry. */
154 nir_push_loop(&b);
155 {
156 const uint32_t rb_avail_offset = 16 * util_last_bit64(enabled_rb_mask) - 4;
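/* For the highest enabled RB index i this is 16 * i + 12, i.e. the upper dword of that RB's end
 * counter, whose top bit is set once the counter pair has been written.
 */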
157
/* Prevent the SSBO load from being moved out of the loop. */
159 nir_scoped_memory_barrier(&b, SCOPE_INVOCATION, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo);
160
161 nir_def *load_offset = nir_iadd_imm(&b, input_base, rb_avail_offset);
162 nir_def *load = nir_load_ssbo(&b, 1, 32, src_buf, load_offset, .align_mul = 4, .access = ACCESS_COHERENT);
163
164 nir_push_if(&b, nir_ige_imm(&b, load, 0x80000000));
165 {
166 nir_jump(&b, nir_jump_break);
167 }
168 nir_pop_if(&b, NULL);
169 }
170 nir_pop_loop(&b, NULL);
171 }
172 nir_pop_if(&b, NULL);
173
174 nir_push_loop(&b);
175
176 nir_def *current_outer_count = nir_load_var(&b, outer_counter);
177 radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
178
179 nir_def *enabled_cond = nir_iand_imm(&b, nir_ishl(&b, nir_imm_int64(&b, 1), current_outer_count), enabled_rb_mask);
180
181 nir_push_if(&b, nir_i2b(&b, enabled_cond));
182
183 nir_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
184 load_offset = nir_iadd(&b, input_base, load_offset);
185
186 nir_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);
187
188 nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
189 nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);
190
191 nir_def *start_done = nir_ilt_imm(&b, nir_load_var(&b, start), 0);
192 nir_def *end_done = nir_ilt_imm(&b, nir_load_var(&b, end), 0);
193
194 nir_push_if(&b, nir_iand(&b, start_done, end_done));
195
196 nir_store_var(&b, result,
197 nir_iadd(&b, nir_load_var(&b, result), nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
198 0x1);
199
200 nir_push_else(&b, NULL);
201
202 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
203
204 nir_pop_if(&b, NULL);
205 nir_pop_if(&b, NULL);
206 nir_pop_loop(&b, NULL);
207
208 /* Store the result if complete or if partial results have been requested. */
209
210 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
211 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
212 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
213
214 nir_push_if(&b, result_is_64bit);
215
216 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);
217
218 nir_push_else(&b, NULL);
219
220 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base, .align_mul = 8);
221
222 nir_pop_if(&b, NULL);
223 nir_pop_if(&b, NULL);
224
225 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
226 nir_b2i32(&b, nir_load_var(&b, available)));
227
228 return b.shader;
229 }
230
231 static nir_shader *
build_pipeline_statistics_query_shader(struct radv_device *device)
233 {
unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
235
236 /* the shader this builds is roughly
237 *
238 * push constants {
239 * uint32_t flags;
240 * uint32_t dst_stride;
241 * uint32_t stats_mask;
242 * uint32_t avail_offset;
243 * };
244 *
245 * uint32_t src_stride = pipelinestat_block_size * 2;
246 *
247 * location(binding = 0) buffer dst_buf;
248 * location(binding = 1) buffer src_buf;
249 *
250 * void main() {
251 * uint64_t src_offset = src_stride * global_id.x;
252 * uint64_t dst_base = dst_stride * global_id.x;
253 * uint64_t dst_offset = dst_base;
254 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
255 * uint32_t elem_count = stats_mask >> 16;
256 * uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
257 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
258 * dst_buf[dst_offset + elem_count * elem_size] = available32;
259 * }
260 * if ((bool)available32) {
261 * // repeat 11 times:
262 * if (stats_mask & (1 << 0)) {
263 * uint64_t start = src_buf[src_offset + 8 * indices[0]];
* uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
* uint64_t result = end - start;
* if (flags & VK_QUERY_RESULT_64_BIT)
* dst_buf[dst_offset] = result;
* else
* dst_buf[dst_offset] = (uint32_t)result;
269 * dst_offset += elem_size;
270 * }
271 * } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
272 * // Set everything to 0 as we don't know what is valid.
273 * for (int i = 0; i < elem_count; ++i)
274 * dst_buf[dst_base + elem_size * i] = 0;
275 * }
276 * }
277 */
278 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query");
279 b.shader->info.workgroup_size[0] = 64;
280
281 nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
282 nir_variable *result = nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
283 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
284
285 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
286 nir_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
287 nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
288
289 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
290 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
291
292 nir_def *global_id = get_global_ids(&b, 1);
293
294 nir_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
295 nir_def *input_base = nir_imul(&b, input_stride, global_id);
296 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
297 nir_def *output_base = nir_imul(&b, output_stride, global_id);
298
299 avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
300
301 nir_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
302 nir_store_var(&b, available, nir_i2b(&b, available32), 0x1);
303
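/* Task shader invocation counters are written from the ACE queue and carry their own availability
 * flag in the top bit of the upper dword of each 64-bit value, so fold that into the overall
 * availability of the query.
 */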
304 if (device->physical_device->emulate_mesh_shader_queries) {
305 nir_push_if(&b, nir_test_mask(&b, stats_mask, VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT));
306 {
307 const uint32_t idx = ffs(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT) - 1;
308
309 nir_def *avail_start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + 4);
310 nir_def *avail_start = nir_load_ssbo(&b, 1, 32, src_buf, avail_start_offset);
311
312 nir_def *avail_end_offset =
313 nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + pipelinestat_block_size + 4);
314 nir_def *avail_end = nir_load_ssbo(&b, 1, 32, src_buf, avail_end_offset);
315
316 nir_def *task_invoc_result_available =
317 nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avail_start, avail_end), 0x80000000));
318
319 nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), task_invoc_result_available), 0x1);
320 }
321 nir_pop_if(&b, NULL);
322 }
323
324 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
325 nir_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
326 nir_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
327
328 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
329 nir_b2i32(&b, nir_load_var(&b, available)));
330
331 nir_push_if(&b, nir_load_var(&b, available));
332
333 nir_store_var(&b, output_offset, output_base, 0x1);
334 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
335 nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i)));
336
337 nir_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8);
338 nir_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset);
339
340 nir_def *end_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size);
341 nir_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset);
342
343 nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
344
345 /* Store result */
346 nir_push_if(&b, result_is_64bit);
347
348 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset));
349
350 nir_push_else(&b, NULL);
351
352 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset));
353
354 nir_pop_if(&b, NULL);
355
356 nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size), 0x1);
357
358 nir_pop_if(&b, NULL);
359 }
360
361 nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
362
363 nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
364
365 /* Stores zeros in all outputs. */
366
367 nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
368 nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
369
370 nir_loop *loop = nir_push_loop(&b);
371
372 nir_def *current_counter = nir_load_var(&b, counter);
373 radv_break_on_count(&b, counter, elem_count);
374
375 nir_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter));
376 nir_push_if(&b, result_is_64bit);
377
378 nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem);
379
380 nir_push_else(&b, NULL);
381
382 nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem);
383
384 nir_pop_if(&b, NULL);
385
386 nir_pop_loop(&b, loop);
387 nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
388 nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
389 return b.shader;
390 }
391
392 static nir_shader *
build_tfb_query_shader(struct radv_device *device)
394 {
395 /* the shader this builds is roughly
396 *
397 * uint32_t src_stride = 32;
398 *
399 * location(binding = 0) buffer dst_buf;
400 * location(binding = 1) buffer src_buf;
401 *
402 * void main() {
403 * uint64_t result[2] = {};
404 * bool available = false;
405 * uint64_t src_offset = src_stride * global_id.x;
406 * uint64_t dst_offset = dst_stride * global_id.x;
407 * uint64_t *src_data = src_buf[src_offset];
408 * uint32_t avail = (src_data[0] >> 32) &
409 * (src_data[1] >> 32) &
410 * (src_data[2] >> 32) &
411 * (src_data[3] >> 32);
412 * if (avail & 0x80000000) {
413 * result[0] = src_data[3] - src_data[1];
414 * result[1] = src_data[2] - src_data[0];
415 * available = true;
416 * }
417 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
418 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
419 * if (flags & VK_QUERY_RESULT_64_BIT) {
420 * dst_buf[dst_offset] = result;
421 * } else {
422 * dst_buf[dst_offset] = (uint32_t)result;
423 * }
424 * }
425 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
426 * dst_buf[dst_offset + result_size] = available;
427 * }
428 * }
429 */
430 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query");
431 b.shader->info.workgroup_size[0] = 64;
432
433 /* Create and initialize local variables. */
434 nir_variable *result = nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result");
435 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
436
437 nir_store_var(&b, result, nir_replicate(&b, nir_imm_int64(&b, 0), 2), 0x3);
438 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
439
440 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
441
442 /* Load resources. */
443 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
444 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
445
446 /* Compute global ID. */
447 nir_def *global_id = get_global_ids(&b, 1);
448
449 /* Compute src/dst strides. */
450 nir_def *input_stride = nir_imm_int(&b, 32);
451 nir_def *input_base = nir_imul(&b, input_stride, global_id);
452 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
453 nir_def *output_base = nir_imul(&b, output_stride, global_id);
454
455 /* Load data from the query pool. */
456 nir_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32);
457 nir_def *load2 = nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16);
458
459 /* Check if result is available. */
460 nir_def *avails[2];
461 avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3));
462 avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3));
463 nir_def *result_is_available = nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000);
464
465 /* Only compute result if available. */
466 nir_push_if(&b, result_is_available);
467
468 /* Pack values. */
469 nir_def *packed64[4];
470 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
471 packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3)));
472 packed64[2] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
473 packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3)));
474
475 /* Compute result. */
476 nir_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]);
477 nir_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]);
478
479 nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3);
480 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
481
482 nir_pop_if(&b, NULL);
483
484 /* Determine if result is 64 or 32 bit. */
485 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
486 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
487
488 /* Store the result if complete or partial results have been requested. */
489 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
490
491 /* Store result. */
492 nir_push_if(&b, result_is_64bit);
493
494 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
495
496 nir_push_else(&b, NULL);
497
498 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
499
500 nir_pop_if(&b, NULL);
501 nir_pop_if(&b, NULL);
502
503 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
504 nir_b2i32(&b, nir_load_var(&b, available)));
505
506 return b.shader;
507 }
508
509 static nir_shader *
build_timestamp_query_shader(struct radv_device *device)
511 {
512 /* the shader this builds is roughly
513 *
514 * uint32_t src_stride = 8;
515 *
516 * location(binding = 0) buffer dst_buf;
517 * location(binding = 1) buffer src_buf;
518 *
519 * void main() {
520 * uint64_t result = 0;
521 * bool available = false;
522 * uint64_t src_offset = src_stride * global_id.x;
523 * uint64_t dst_offset = dst_stride * global_id.x;
524 * uint64_t timestamp = src_buf[src_offset];
525 * if (timestamp != TIMESTAMP_NOT_READY) {
526 * result = timestamp;
527 * available = true;
528 * }
529 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
530 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
531 * if (flags & VK_QUERY_RESULT_64_BIT) {
532 * dst_buf[dst_offset] = result;
533 * } else {
534 * dst_buf[dst_offset] = (uint32_t)result;
535 * }
536 * }
537 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
538 * dst_buf[dst_offset + result_size] = available;
539 * }
540 * }
541 */
542 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query");
543 b.shader->info.workgroup_size[0] = 64;
544
545 /* Create and initialize local variables. */
546 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
547 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
548
549 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
550 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
551
552 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
553
554 /* Load resources. */
555 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
556 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
557
558 /* Compute global ID. */
559 nir_def *global_id = get_global_ids(&b, 1);
560
561 /* Compute src/dst strides. */
562 nir_def *input_stride = nir_imm_int(&b, 8);
563 nir_def *input_base = nir_imul(&b, input_stride, global_id);
564 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
565 nir_def *output_base = nir_imul(&b, output_stride, global_id);
566
567 /* Load data from the query pool. */
568 nir_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8);
569
570 /* Pack the timestamp. */
571 nir_def *timestamp;
572 timestamp = nir_pack_64_2x32(&b, nir_trim_vector(&b, load, 2));
573
574 /* Check if result is available. */
575 nir_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY));
576
577 /* Only store result if available. */
578 nir_push_if(&b, result_is_available);
579
580 nir_store_var(&b, result, timestamp, 0x1);
581 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
582
583 nir_pop_if(&b, NULL);
584
585 /* Determine if result is 64 or 32 bit. */
586 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
587 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
588
589 /* Store the result if complete or partial results have been requested. */
590 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
591
592 /* Store result. */
593 nir_push_if(&b, result_is_64bit);
594
595 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
596
597 nir_push_else(&b, NULL);
598
599 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
600
601 nir_pop_if(&b, NULL);
602
603 nir_pop_if(&b, NULL);
604
605 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
606 nir_b2i32(&b, nir_load_var(&b, available)));
607
608 return b.shader;
609 }
610
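/* Layout of a primitives-generated query slot: 32 bytes of begin/end counter data, optionally
 * followed by two 64-bit GDS counters when NGG emulation is used (begin value at offset 32 with
 * its availability dword at 36, end value at offset 40 with its availability dword at 44), as
 * consumed by build_pg_query_shader().
 */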
611 #define RADV_PGQ_STRIDE 32
612 #define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 8 * 2)
613
614 static nir_shader *
build_pg_query_shader(struct radv_device *device)
616 {
617 /* the shader this builds is roughly
618 *
619 * uint32_t src_stride = 32;
620 *
621 * location(binding = 0) buffer dst_buf;
622 * location(binding = 1) buffer src_buf;
623 *
624 * void main() {
625 * uint64_t result = {};
626 * bool available = false;
627 * uint64_t src_offset = src_stride * global_id.x;
628 * uint64_t dst_offset = dst_stride * global_id.x;
629 * uint64_t *src_data = src_buf[src_offset];
630 * uint32_t avail = (src_data[0] >> 32) &
631 * (src_data[2] >> 32);
632 * if (avail & 0x80000000) {
633 * result = src_data[2] - src_data[0];
634 * if (use_gds) {
635 * uint32_t ngg_gds_result = 0;
636 * ngg_gds_result += src_data[9] - src_data[8];
637 * result += (uint64_t)ngg_gds_result;
638 * }
639 * available = true;
640 * }
641 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
642 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
643 * if (flags & VK_QUERY_RESULT_64_BIT) {
644 * dst_buf[dst_offset] = result;
645 * } else {
646 * dst_buf[dst_offset] = (uint32_t)result;
647 * }
648 * }
649 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
650 * dst_buf[dst_offset + result_size] = available;
651 * }
652 * }
653 */
654 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query");
655 b.shader->info.workgroup_size[0] = 64;
656
657 /* Create and initialize local variables. */
658 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
659 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
660
661 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
662 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
663
664 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
665
666 /* Load resources. */
667 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
668 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
669
670 /* Compute global ID. */
671 nir_def *global_id = get_global_ids(&b, 1);
672
673 /* Determine if the query pool uses GDS for NGG. */
674 nir_def *uses_gds = nir_i2b(&b, nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20));
675
676 /* Compute src/dst strides. */
677 nir_def *input_stride =
678 nir_bcsel(&b, uses_gds, nir_imm_int(&b, RADV_PGQ_STRIDE_GDS), nir_imm_int(&b, RADV_PGQ_STRIDE));
679 nir_def *input_base = nir_imul(&b, input_stride, global_id);
680 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
681 nir_def *output_base = nir_imul(&b, output_stride, global_id);
682
683 /* Load data from the query pool. */
684 nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
685 nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16);
686
687 /* Check if result is available. */
688 nir_def *avails[2];
689 avails[0] = nir_channel(&b, load1, 1);
690 avails[1] = nir_channel(&b, load2, 1);
691 nir_store_var(&b, available, nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000)), 0x1);
692
693 nir_push_if(&b, uses_gds);
694 {
695 nir_def *gds_avail_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 36), .align_mul = 4);
696 nir_def *gds_avail_end = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 44), .align_mul = 4);
697 nir_def *gds_result_available =
698 nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, gds_avail_start, gds_avail_end), 0x80000000));
699
700 nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), gds_result_available), 0x1);
701 }
702 nir_pop_if(&b, NULL);
703
704 /* Only compute result if available. */
705 nir_push_if(&b, nir_load_var(&b, available));
706
707 /* Pack values. */
708 nir_def *packed64[2];
709 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
710 packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
711
712 /* Compute result. */
713 nir_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]);
714
715 nir_store_var(&b, result, primitive_storage_needed, 0x1);
716
717 nir_push_if(&b, uses_gds);
718 {
719 nir_def *gds_start =
720 nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4);
721 nir_def *gds_end =
722 nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 4);
723
724 nir_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
725
726 nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), nir_u2u64(&b, ngg_gds_result)), 0x1);
727 }
728 nir_pop_if(&b, NULL);
729
730 nir_pop_if(&b, NULL);
731
732 /* Determine if result is 64 or 32 bit. */
733 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
734 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
735
736 /* Store the result if complete or partial results have been requested. */
737 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
738
739 /* Store result. */
740 nir_push_if(&b, result_is_64bit);
741
742 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
743
744 nir_push_else(&b, NULL);
745
746 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
747
748 nir_pop_if(&b, NULL);
749 nir_pop_if(&b, NULL);
750
751 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
752 nir_b2i32(&b, nir_load_var(&b, available)));
753
754 return b.shader;
755 }
756
757 static nir_shader *
build_ms_prim_gen_query_shader(struct radv_device *device)
759 {
760 /* the shader this builds is roughly
761 *
762 * uint32_t src_stride = 32;
763 *
764 * location(binding = 0) buffer dst_buf;
765 * location(binding = 1) buffer src_buf;
766 *
767 * void main() {
768 * uint64_t result = {};
769 * bool available = false;
770 * uint64_t src_offset = src_stride * global_id.x;
771 * uint64_t dst_offset = dst_stride * global_id.x;
772 * uint64_t *src_data = src_buf[src_offset];
773 * uint32_t avail = (src_data[0] >> 32) & (src_data[1] >> 32);
774 * if (avail & 0x80000000) {
775 * result = src_data[1] - src_data[0];
776 * available = true;
777 * }
778 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
779 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
780 * if (flags & VK_QUERY_RESULT_64_BIT) {
781 * dst_buf[dst_offset] = result;
782 * } else {
783 * dst_buf[dst_offset] = (uint32_t)result;
784 * }
785 * }
786 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
787 * dst_buf[dst_offset + result_size] = available;
788 * }
789 * }
790 */
791 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "ms_prim_gen_query");
792 b.shader->info.workgroup_size[0] = 64;
793
794 /* Create and initialize local variables. */
795 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
796 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
797
798 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
799 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
800
801 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
802
803 /* Load resources. */
804 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
805 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
806
807 /* Compute global ID. */
808 nir_def *global_id = get_global_ids(&b, 1);
809
810 /* Compute src/dst strides. */
811 nir_def *input_base = nir_imul_imm(&b, global_id, 16);
812 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
813 nir_def *output_base = nir_imul(&b, output_stride, global_id);
814
815 /* Load data from the query pool. */
816 nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
817 nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd_imm(&b, input_base, 8), .align_mul = 16);
818
819 /* Check if result is available. */
820 nir_def *avails[2];
821 avails[0] = nir_channel(&b, load1, 1);
822 avails[1] = nir_channel(&b, load2, 1);
823 nir_def *result_is_available = nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000));
824
825 /* Only compute result if available. */
826 nir_push_if(&b, result_is_available);
827
828 /* Pack values. */
829 nir_def *packed64[2];
830 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
831 packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
832
833 /* Compute result. */
834 nir_def *ms_prim_gen = nir_isub(&b, packed64[1], packed64[0]);
835
836 nir_store_var(&b, result, ms_prim_gen, 0x1);
837
838 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
839
840 nir_pop_if(&b, NULL);
841
842 /* Determine if result is 64 or 32 bit. */
843 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
844 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
845
846 /* Store the result if complete or partial results have been requested. */
847 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
848
849 /* Store result. */
850 nir_push_if(&b, result_is_64bit);
851
852 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
853
854 nir_push_else(&b, NULL);
855
856 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
857
858 nir_pop_if(&b, NULL);
859 nir_pop_if(&b, NULL);
860
861 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
862 nir_b2i32(&b, nir_load_var(&b, available)));
863
864 return b.shader;
865 }
866
867 static VkResult
radv_device_init_meta_query_state_internal(struct radv_device *device)
869 {
870 VkResult result;
871 nir_shader *occlusion_cs = NULL;
872 nir_shader *pipeline_statistics_cs = NULL;
873 nir_shader *tfb_cs = NULL;
874 nir_shader *timestamp_cs = NULL;
875 nir_shader *pg_cs = NULL;
876 nir_shader *ms_prim_gen_cs = NULL;
877
878 mtx_lock(&device->meta_state.mtx);
879 if (device->meta_state.query.pipeline_statistics_query_pipeline) {
880 mtx_unlock(&device->meta_state.mtx);
881 return VK_SUCCESS;
882 }
883 occlusion_cs = build_occlusion_query_shader(device);
884 pipeline_statistics_cs = build_pipeline_statistics_query_shader(device);
885 tfb_cs = build_tfb_query_shader(device);
886 timestamp_cs = build_timestamp_query_shader(device);
887 pg_cs = build_pg_query_shader(device);
888
889 if (device->physical_device->emulate_mesh_shader_queries)
890 ms_prim_gen_cs = build_ms_prim_gen_query_shader(device);
891
892 VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
893 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
894 .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
895 .bindingCount = 2,
896 .pBindings = (VkDescriptorSetLayoutBinding[]){
897 {.binding = 0,
898 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
899 .descriptorCount = 1,
900 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
901 .pImmutableSamplers = NULL},
902 {.binding = 1,
903 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
904 .descriptorCount = 1,
905 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
906 .pImmutableSamplers = NULL},
907 }};
908
909 result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &occlusion_ds_create_info,
910 &device->meta_state.alloc, &device->meta_state.query.ds_layout);
911 if (result != VK_SUCCESS)
912 goto fail;
913
914 VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
915 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
916 .setLayoutCount = 1,
917 .pSetLayouts = &device->meta_state.query.ds_layout,
918 .pushConstantRangeCount = 1,
919 .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 20},
920 };
921
922 result = radv_CreatePipelineLayout(radv_device_to_handle(device), &occlusion_pl_create_info,
923 &device->meta_state.alloc, &device->meta_state.query.p_layout);
924 if (result != VK_SUCCESS)
925 goto fail;
926
927 VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
928 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
929 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
930 .module = vk_shader_module_handle_from_nir(occlusion_cs),
931 .pName = "main",
932 .pSpecializationInfo = NULL,
933 };
934
935 VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
936 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
937 .stage = occlusion_pipeline_shader_stage,
938 .flags = 0,
939 .layout = device->meta_state.query.p_layout,
940 };
941
942 result =
943 radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &occlusion_vk_pipeline_info,
944 NULL, &device->meta_state.query.occlusion_query_pipeline);
945 if (result != VK_SUCCESS)
946 goto fail;
947
948 VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
949 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
950 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
951 .module = vk_shader_module_handle_from_nir(pipeline_statistics_cs),
952 .pName = "main",
953 .pSpecializationInfo = NULL,
954 };
955
956 VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
957 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
958 .stage = pipeline_statistics_pipeline_shader_stage,
959 .flags = 0,
960 .layout = device->meta_state.query.p_layout,
961 };
962
963 result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache,
964 &pipeline_statistics_vk_pipeline_info, NULL,
965 &device->meta_state.query.pipeline_statistics_query_pipeline);
966 if (result != VK_SUCCESS)
967 goto fail;
968
969 VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
970 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
971 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
972 .module = vk_shader_module_handle_from_nir(tfb_cs),
973 .pName = "main",
974 .pSpecializationInfo = NULL,
975 };
976
977 VkComputePipelineCreateInfo tfb_pipeline_info = {
978 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
979 .stage = tfb_pipeline_shader_stage,
980 .flags = 0,
981 .layout = device->meta_state.query.p_layout,
982 };
983
984 result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &tfb_pipeline_info,
985 NULL, &device->meta_state.query.tfb_query_pipeline);
986 if (result != VK_SUCCESS)
987 goto fail;
988
989 VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
990 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
991 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
992 .module = vk_shader_module_handle_from_nir(timestamp_cs),
993 .pName = "main",
994 .pSpecializationInfo = NULL,
995 };
996
997 VkComputePipelineCreateInfo timestamp_pipeline_info = {
998 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
999 .stage = timestamp_pipeline_shader_stage,
1000 .flags = 0,
1001 .layout = device->meta_state.query.p_layout,
1002 };
1003
1004 result =
radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &timestamp_pipeline_info,
1006 NULL, &device->meta_state.query.timestamp_query_pipeline);
1007 if (result != VK_SUCCESS)
1008 goto fail;
1009
1010 VkPipelineShaderStageCreateInfo pg_pipeline_shader_stage = {
1011 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1012 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1013 .module = vk_shader_module_handle_from_nir(pg_cs),
1014 .pName = "main",
1015 .pSpecializationInfo = NULL,
1016 };
1017
1018 VkComputePipelineCreateInfo pg_pipeline_info = {
1019 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1020 .stage = pg_pipeline_shader_stage,
1021 .flags = 0,
1022 .layout = device->meta_state.query.p_layout,
1023 };
1024
1025 result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &pg_pipeline_info,
1026 NULL, &device->meta_state.query.pg_query_pipeline);
1027
1028 if (device->physical_device->emulate_mesh_shader_queries) {
1029 VkPipelineShaderStageCreateInfo ms_prim_gen_pipeline_shader_stage = {
1030 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1031 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1032 .module = vk_shader_module_handle_from_nir(ms_prim_gen_cs),
1033 .pName = "main",
1034 .pSpecializationInfo = NULL,
1035 };
1036
1037 VkComputePipelineCreateInfo ms_prim_gen_pipeline_info = {
1038 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1039 .stage = ms_prim_gen_pipeline_shader_stage,
1040 .flags = 0,
1041 .layout = device->meta_state.query.p_layout,
1042 };
1043
1044 result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache,
1045 &ms_prim_gen_pipeline_info, NULL,
1046 &device->meta_state.query.ms_prim_gen_query_pipeline);
1047 }
1048
1049 fail:
1050 ralloc_free(occlusion_cs);
1051 ralloc_free(pipeline_statistics_cs);
1052 ralloc_free(tfb_cs);
1053 ralloc_free(pg_cs);
1054 ralloc_free(ms_prim_gen_cs);
1055 ralloc_free(timestamp_cs);
1056 mtx_unlock(&device->meta_state.mtx);
1057 return result;
1058 }
1059
1060 VkResult
radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
1062 {
1063 if (on_demand)
1064 return VK_SUCCESS;
1065
1066 return radv_device_init_meta_query_state_internal(device);
1067 }
1068
1069 void
radv_device_finish_meta_query_state(struct radv_device *device)
1071 {
1072 if (device->meta_state.query.tfb_query_pipeline)
1073 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.tfb_query_pipeline,
1074 &device->meta_state.alloc);
1075
1076 if (device->meta_state.query.pipeline_statistics_query_pipeline)
1077 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.pipeline_statistics_query_pipeline,
1078 &device->meta_state.alloc);
1079
1080 if (device->meta_state.query.occlusion_query_pipeline)
1081 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.occlusion_query_pipeline,
1082 &device->meta_state.alloc);
1083
1084 if (device->meta_state.query.timestamp_query_pipeline)
1085 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.timestamp_query_pipeline,
1086 &device->meta_state.alloc);
1087
1088 if (device->meta_state.query.pg_query_pipeline)
1089 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.pg_query_pipeline,
1090 &device->meta_state.alloc);
1091
1092 if (device->meta_state.query.ms_prim_gen_query_pipeline)
1093 radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.ms_prim_gen_query_pipeline,
1094 &device->meta_state.alloc);
1095
1096 if (device->meta_state.query.p_layout)
1097 radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout,
1098 &device->meta_state.alloc);
1099
1100 if (device->meta_state.query.ds_layout)
1101 device->vk.dispatch_table.DestroyDescriptorSetLayout(
1102 radv_device_to_handle(device), device->meta_state.query.ds_layout, &device->meta_state.alloc);
1103 }
1104
1105 static void
radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, struct radeon_winsys_bo *src_bo,
1107 struct radeon_winsys_bo *dst_bo, uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride,
1108 uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags, uint32_t pipeline_stats_mask,
1109 uint32_t avail_offset, bool uses_gds)
1110 {
1111 struct radv_device *device = cmd_buffer->device;
1112 struct radv_meta_saved_state saved_state;
1113 struct radv_buffer src_buffer, dst_buffer;
1114
1115 if (!*pipeline) {
1116 VkResult ret = radv_device_init_meta_query_state_internal(device);
1117 if (ret != VK_SUCCESS) {
1118 vk_command_buffer_set_error(&cmd_buffer->vk, ret);
1119 return;
1120 }
1121 }
1122
1123 /* VK_EXT_conditional_rendering says that copy commands should not be
1124 * affected by conditional rendering.
1125 */
1126 radv_meta_save(&saved_state, cmd_buffer,
1127 RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS |
1128 RADV_META_SUSPEND_PREDICATING);
1129
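/* The source view must cover both the per-query data and the availability dwords; the destination
 * only needs (count - 1) full strides plus the size of one result.
 */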
1130 uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset);
1131 uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size;
1132
1133 radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset);
1134 radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset);
1135
1136 radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1137
1138 radv_meta_push_descriptor_set(
1139 cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.query.p_layout, 0, /* set */
1140 2, /* descriptorWriteCount */
1141 (VkWriteDescriptorSet[]){{.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1142 .dstBinding = 0,
1143 .dstArrayElement = 0,
1144 .descriptorCount = 1,
1145 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1146 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer),
1147 .offset = 0,
1148 .range = VK_WHOLE_SIZE}},
1149 {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1150 .dstBinding = 1,
1151 .dstArrayElement = 0,
1152 .descriptorCount = 1,
1153 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1154 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer),
1155 .offset = 0,
1156 .range = VK_WHOLE_SIZE}}});
1157
1158 /* Encode the number of elements for easy access by the shader. */
1159 pipeline_stats_mask &= (1 << (radv_get_pipelinestat_query_size(device) / 8)) - 1;
1160 pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
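/* e.g. for VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
 * VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT (0x44) two bits are set, so the shader
 * receives 0x44 | (2 << 16) = 0x20044 and reads elem_count = 2 from the upper 16 bits.
 */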
1161
1162 avail_offset -= src_offset;
1163
1164 struct {
1165 uint32_t flags;
1166 uint32_t dst_stride;
1167 uint32_t pipeline_stats_mask;
1168 uint32_t avail_offset;
1169 uint32_t uses_gds;
1170 } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_gds};
1171
1172 vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.query.p_layout,
1173 VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
1174
1175 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1176
1177 if (flags & VK_QUERY_RESULT_WAIT_BIT)
1178 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1179
1180 radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1181
1182 /* Ensure that the query copy dispatch is complete before a potential vkCmdResetPool because
1183 * there is an implicit execution dependency from each such query command to all query commands
1184 * previously submitted to the same queue.
1185 */
1186 cmd_buffer->active_query_flush_bits |=
1187 RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1188
1189 radv_buffer_finish(&src_buffer);
1190 radv_buffer_finish(&dst_buffer);
1191
1192 radv_meta_restore(&saved_state, cmd_buffer);
1193 }
1194
1195 static void
radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
1197 struct radv_query_pool *pool)
1198 {
1199 if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
1200 radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
1201
1202 if (pool->bo) {
1203 radv_rmv_log_bo_destroy(device, pool->bo);
1204 device->ws->buffer_destroy(device->ws, pool->bo);
1205 }
1206
1207 radv_rmv_log_resource_destroy(device, (uint64_t)radv_query_pool_to_handle(pool));
1208 vk_query_pool_finish(&pool->vk);
1209 vk_free2(&device->vk.alloc, pAllocator, pool);
1210 }
1211
1212 VkResult
radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *pCreateInfo,
1214 const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool, bool is_internal)
1215 {
1216 VkResult result;
1217 size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
1218 ? sizeof(struct radv_pc_query_pool)
1219 : sizeof(struct radv_query_pool);
1220
1221 struct radv_query_pool *pool =
1222 vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1223
1224 if (!pool)
1225 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1226
1227 vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
1228
1229 /* The number of primitives generated by geometry shader invocations is only counted by the
1230 * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number
1231 * of generated primitives and we have to increment it from the shader using a plain GDS atomic.
1232 *
1233 * The number of geometry shader invocations is correctly counted by the hardware for both NGG
1234 * and the legacy GS path but it increments for NGG VS/TES because they are merged with GS. To
* prevent this counter from incrementing, it's also emulated.
1236 */
1237 pool->uses_gds =
1238 (device->physical_device->emulate_ngg_gs_query_pipeline_stat &&
1239 (pool->vk.pipeline_statistics & (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
1240 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT))) ||
1241 (device->physical_device->use_ngg && pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) ||
1242 (device->physical_device->emulate_mesh_shader_queries &&
1243 (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT ||
1244 pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT));
1245
1246 /* The number of task shader invocations needs to be queried on ACE. */
1247 pool->uses_ace = (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1248
1249 switch (pCreateInfo->queryType) {
1250 case VK_QUERY_TYPE_OCCLUSION:
1251 pool->stride = 16 * device->physical_device->rad_info.max_render_backends;
1252 break;
1253 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1254 pool->stride = radv_get_pipelinestat_query_size(device) * 2;
1255 break;
1256 case VK_QUERY_TYPE_TIMESTAMP:
1257 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1258 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1259 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1260 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1261 pool->stride = 8;
1262 break;
1263 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1264 pool->stride = 32;
1265 break;
1266 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1267 if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) {
1268 /* When the hardware can use both the legacy and the NGG paths in the same begin/end pair,
1269 * allocate 2x64-bit values for the GDS counters.
1270 */
1271 pool->stride = RADV_PGQ_STRIDE_GDS;
1272 } else {
1273 pool->stride = RADV_PGQ_STRIDE;
1274 }
1275 break;
1276 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1277 result = radv_pc_init_query_pool(device->physical_device, pCreateInfo, (struct radv_pc_query_pool *)pool);
1278
1279 if (result != VK_SUCCESS) {
1280 radv_destroy_query_pool(device, pAllocator, pool);
1281 return vk_error(device, result);
1282 }
1283 break;
1284 }
1285 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1286 if (device->physical_device->rad_info.gfx_level >= GFX11) {
1287 /* GFX11 natively supports mesh generated primitives with pipeline statistics. */
1288 pool->stride = radv_get_pipelinestat_query_size(device) * 2;
1289 } else {
1290 assert(device->physical_device->emulate_mesh_shader_queries);
1291 pool->stride = 16;
1292 }
1293 break;
1294 default:
1295 unreachable("creating unhandled query type");
1296 }
1297
1298 pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1299 pool->size = pool->availability_offset;
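/* One 32-bit availability flag per query, written at pool->availability_offset + 4 * query and
 * read back in radv_GetQueryPoolResults().
 */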
1300 if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1301 (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT &&
1302 device->physical_device->rad_info.gfx_level >= GFX11))
1303 pool->size += 4 * pCreateInfo->queryCount;
1304
1305 result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT,
1306 RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo);
1307 if (result != VK_SUCCESS) {
1308 radv_destroy_query_pool(device, pAllocator, pool);
1309 return vk_error(device, result);
1310 }
1311
1312 pool->ptr = device->ws->buffer_map(pool->bo);
1313 if (!pool->ptr) {
1314 radv_destroy_query_pool(device, pAllocator, pool);
1315 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1316 }
1317
1318 *pQueryPool = radv_query_pool_to_handle(pool);
1319 radv_rmv_log_query_pool_create(device, *pQueryPool, is_internal);
1320 return VK_SUCCESS;
1321 }
1322
1323 VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
1325 const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1326 {
1327 RADV_FROM_HANDLE(radv_device, device, _device);
1328 return radv_create_query_pool(device, pCreateInfo, pAllocator, pQueryPool, false);
1329 }
1330
1331 VKAPI_ATTR void VKAPI_CALL
radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator)
1333 {
1334 RADV_FROM_HANDLE(radv_device, device, _device);
1335 RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
1336
1337 if (!pool)
1338 return;
1339
1340 radv_destroy_query_pool(device, pAllocator, pool);
1341 }
1342
1343 static inline uint64_t
radv_get_rel_timeout_for_query(VkQueryType type)
1345 {
1346 /*
1347 * Certain queries are only possible on certain types of queues
1348 * so pick the TDR timeout of the highest possible type
* and double it to ensure GetQueryPoolResults completes in finite time.
1350 *
1351 * (compute has longer TDR than gfx, other rings)
1352 */
1353 switch (type) {
1354 case VK_QUERY_TYPE_OCCLUSION:
1355 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1356 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1357 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1358 return radv_get_tdr_timeout_for_ip(AMD_IP_GFX) * 2;
1359 default:
1360 return radv_get_tdr_timeout_for_ip(AMD_IP_COMPUTE) * 2;
1361 }
1362 }
1363
1364 VKAPI_ATTR VkResult VKAPI_CALL
radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount,
1366 size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)
1367 {
1368 RADV_FROM_HANDLE(radv_device, device, _device);
1369 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1370 char *data = pData;
1371 VkResult result = VK_SUCCESS;
1372
1373 if (vk_device_is_lost(&device->vk))
1374 return VK_ERROR_DEVICE_LOST;
1375
1376 for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) {
1377 char *dest = data;
1378 unsigned query = firstQuery + query_idx;
1379 char *src = pool->ptr + query * pool->stride;
1380 uint32_t available;
1381 bool timed_out = false;
1382 uint64_t atimeout = os_time_get_absolute_timeout(radv_get_rel_timeout_for_query(pool->vk.query_type));
1383
1384 switch (pool->vk.query_type) {
1385 case VK_QUERY_TYPE_TIMESTAMP:
1386 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1387 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1388 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1389 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: {
1390 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1391 uint64_t value;
1392
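/* A timestamp slot holds a single 64-bit value; TIMESTAMP_NOT_READY means the GPU has not
 * written it yet. With WAIT_BIT we spin until it changes, bounded by the TDR-based timeout.
 */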
1393 do {
1394 value = p_atomic_read(&src64->value);
1395 } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1396 !(timed_out = (atimeout < os_time_get_nano())));
1397
1398 available = value != TIMESTAMP_NOT_READY;
1399
1400 if (timed_out)
1401 result = VK_ERROR_DEVICE_LOST;
1402 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1403 result = VK_NOT_READY;
1404
1405 if (flags & VK_QUERY_RESULT_64_BIT) {
1406 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1407 *(uint64_t *)dest = value;
1408 dest += 8;
1409 } else {
1410 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1411 *(uint32_t *)dest = (uint32_t)value;
1412 dest += 4;
1413 }
1414 break;
1415 }
1416 case VK_QUERY_TYPE_OCCLUSION: {
1417 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1418 uint32_t db_count = device->physical_device->rad_info.max_render_backends;
1419 uint64_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1420 uint64_t sample_count = 0;
1421 available = 1;
1422
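/* Occlusion results are stored as one (begin, end) pair of 64-bit ZPASS counters per render
 * backend; bit 63 of each value is set once the GPU has written it. Sum end - begin over all
 * enabled RBs to get the sample count.
 */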
1423 for (int i = 0; i < db_count; ++i) {
1424 uint64_t start, end;
1425
1426 if (!(enabled_rb_mask & (1ull << i)))
1427 continue;
1428
1429 do {
1430 start = p_atomic_read(&src64[2 * i].value);
1431 end = p_atomic_read(&src64[2 * i + 1].value);
1432 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1433 !(timed_out = (atimeout < os_time_get_nano())));
1434
1435 if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1436 available = 0;
1437 else {
1438 sample_count += end - start;
1439 }
1440 }
1441
1442 if (timed_out)
1443 result = VK_ERROR_DEVICE_LOST;
1444 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1445 result = VK_NOT_READY;
1446
1447 if (flags & VK_QUERY_RESULT_64_BIT) {
1448 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1449 *(uint64_t *)dest = sample_count;
1450 dest += 8;
1451 } else {
1452 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1453 *(uint32_t *)dest = sample_count;
1454 dest += 4;
1455 }
1456 break;
1457 }
1458 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1459 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1460 const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1461
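/* Pipeline statistics are stored as two blocks of 64-bit counters (begin and end snapshots),
 * with a separate availability dword at pool->availability_offset. The emulated task shader
 * invocation counters additionally carry a written flag in the top bit of their upper dword.
 */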
1462 do {
1463 available = p_atomic_read(avail_ptr);
1464
1465 if (pool->uses_ace && device->physical_device->emulate_mesh_shader_queries) {
1466 const uint32_t task_invoc_offset =
1467 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1468 const uint32_t *avail_ptr_start = (const uint32_t *)(src + task_invoc_offset + 4);
1469 const uint32_t *avail_ptr_stop =
1470 (const uint32_t *)(src + pipelinestat_block_size + task_invoc_offset + 4);
1471
1472 if (!(p_atomic_read(avail_ptr_start) & 0x80000000) || !(p_atomic_read(avail_ptr_stop) & 0x80000000))
1473 available = 0;
1474 }
1475 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1476
1477 if (timed_out)
1478 result = VK_ERROR_DEVICE_LOST;
1479 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1480 result = VK_NOT_READY;
1481
1482 const uint64_t *start = (uint64_t *)src;
1483 const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1484 if (flags & VK_QUERY_RESULT_64_BIT) {
1485 uint64_t *dst = (uint64_t *)dest;
1486 dest += util_bitcount(pool->vk.pipeline_statistics) * 8;
1487 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1488 if (pool->vk.pipeline_statistics & (1u << i)) {
1489 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1490 *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
1491 }
1492 dst++;
1493 }
1494 }
1495
1496 } else {
1497 uint32_t *dst = (uint32_t *)dest;
1498 dest += util_bitcount(pool->vk.pipeline_statistics) * 4;
1499 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1500 if (pool->vk.pipeline_statistics & (1u << i)) {
1501 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1502 *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
1503 }
1504 dst++;
1505 }
1506 }
1507 }
1508 break;
1509 }
1510 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1511 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1512 uint64_t num_primitives_written;
1513 uint64_t primitive_storage_needed;
1514
1515 /* SAMPLE_STREAMOUTSTATS stores this structure:
1516 * {
1517 * u64 NumPrimitivesWritten;
1518 * u64 PrimitiveStorageNeeded;
1519 * }
1520 */
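/* The pool stores two such structures per query (begin at offset 0, end at offset 16); bit 63
 * of each 64-bit word is set once the GPU has written it, which is what the loop below waits on.
 */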
1521 do {
1522 available = 1;
1523 for (int j = 0; j < 4; j++) {
1524 if (!(p_atomic_read(&src64[j].value) & 0x8000000000000000UL))
1525 available = 0;
1526 }
1527 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1528
1529 if (timed_out)
1530 result = VK_ERROR_DEVICE_LOST;
1531 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1532 result = VK_NOT_READY;
1533
1534 num_primitives_written = p_atomic_read_relaxed(&src64[3].value) - p_atomic_read_relaxed(&src64[1].value);
1535 primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
1536
1537 if (flags & VK_QUERY_RESULT_64_BIT) {
1538 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1539 *(uint64_t *)dest = num_primitives_written;
1540 dest += 8;
1541 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1542 *(uint64_t *)dest = primitive_storage_needed;
1543 dest += 8;
1544 } else {
1545 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1546 *(uint32_t *)dest = num_primitives_written;
1547 dest += 4;
1548 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1549 *(uint32_t *)dest = primitive_storage_needed;
1550 dest += 4;
1551 }
1552 break;
1553 }
1554 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1555 const bool uses_gds_query = pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11;
1556 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1557 uint64_t primitive_storage_needed;
1558
1559 /* SAMPLE_STREAMOUTSTATS stores this structure:
1560 * {
1561 * u64 NumPrimitivesWritten;
1562 * u64 PrimitiveStorageNeeded;
1563 * }
1564 */
1565 do {
1566 available = 1;
1567 if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
1568 !(p_atomic_read(&src64[2].value) & 0x8000000000000000UL)) {
1569 available = 0;
1570 }
1571 if (uses_gds_query && (!(p_atomic_read(&src64[4].value) & 0x8000000000000000UL) ||
1572 !(p_atomic_read(&src64[5].value) & 0x8000000000000000UL))) {
1573 available = 0;
1574 }
1575 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1576
1577 if (timed_out)
1578 result = VK_ERROR_DEVICE_LOST;
1579 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1580 result = VK_NOT_READY;
1581
1582 primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
1583
1584 if (uses_gds_query) {
1585 /* Accumulate the result that was copied from GDS in case an NGG shader has been used. */
1586 primitive_storage_needed += p_atomic_read_relaxed(&src64[5].value) - p_atomic_read_relaxed(&src64[4].value);
1587 }
1588
1589 if (flags & VK_QUERY_RESULT_64_BIT) {
1590 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1591 *(uint64_t *)dest = primitive_storage_needed;
1592 dest += 8;
1593 } else {
1594 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1595 *(uint32_t *)dest = primitive_storage_needed;
1596 dest += 4;
1597 }
1598 break;
1599 }
1600 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1601 struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
1602 const p_atomic_uint64_t *src64 = (const p_atomic_uint64_t *)src;
1603 bool avail;
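/* One 64-bit availability word per pass is written at the end of the query slot; all of them
 * must be non-zero before the counter data can be decoded by radv_pc_get_results().
 */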
1604 do {
1605 avail = true;
1606 for (unsigned i = 0; i < pc_pool->num_passes; ++i)
1607 if (!p_atomic_read(&src64[pool->stride / 8 - i - 1].value))
1608 avail = false;
1609 } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
1610
1611 available = avail;
1612
1613 radv_pc_get_results(pc_pool, &src64->value, dest);
1614 dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
1615 break;
1616 }
1617 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
1618 uint64_t ms_prim_gen;
1619
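/* On GFX11+ the mesh primitive count is taken from the emulated pipeline-statistics slot; on
 * older GPUs it is a begin/end pair of 64-bit counters copied from GDS, with bit 63 as the
 * written flag.
 */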
1620 if (device->physical_device->rad_info.gfx_level >= GFX11) {
1621 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1622 const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1623
1624 do {
1625 available = p_atomic_read(avail_ptr);
1626 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1627 !(timed_out = (atimeout < os_time_get_nano())));
1628
1629 const uint64_t *start = (uint64_t *)src;
1630 const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1631
1632 ms_prim_gen = stop[pipeline_statistics_indices[13]] - start[pipeline_statistics_indices[13]];
1633 } else {
1634 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
1635
1636 do {
1637 available = 1;
1638 if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
1639 !(p_atomic_read(&src64[1].value) & 0x8000000000000000UL)) {
1640 available = 0;
1641 }
1642 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
1643 !(timed_out = (atimeout < os_time_get_nano())));
1644
1645 ms_prim_gen = p_atomic_read_relaxed(&src64[1].value) - p_atomic_read_relaxed(&src64[0].value);
1646 }
1647
1648 if (timed_out)
1649 result = VK_ERROR_DEVICE_LOST;
1650 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1651 result = VK_NOT_READY;
1652
1653 if (flags & VK_QUERY_RESULT_64_BIT) {
1654 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1655 *(uint64_t *)dest = ms_prim_gen;
1656 dest += 8;
1657 } else {
1658 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1659 *(uint32_t *)dest = ms_prim_gen;
1660 dest += 4;
1661 }
1662 break;
1663 }
1664 default:
1665 unreachable("trying to get results of unhandled query type");
1666 }
1667
1668 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1669 if (flags & VK_QUERY_RESULT_64_BIT) {
1670 *(uint64_t *)dest = available;
1671 } else {
1672 *(uint32_t *)dest = available;
1673 }
1674 }
1675 }
1676
1677 if (result == VK_ERROR_DEVICE_LOST)
1678 vk_device_set_lost(&device->vk, "GetQueryPoolResults timed out");
1679
1680 return result;
1681 }
1682
1683 static void
1684 emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool)
1685 {
1686 if (cmd_buffer->pending_reset_query) {
1687 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1688 /* Only need to flush caches if the query pool size is
1689 * large enough to be reset using the compute shader
1690 * path. Small pools don't need any cache flushes
1691 * because we use a CP DMA clear.
1692 */
1693 radv_emit_cache_flush(cmd_buffer);
1694 }
1695 }
1696 }
1697
1698 static size_t
1699 radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags)
1700 {
1701 unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0;
1702 switch (pool->vk.query_type) {
1703 case VK_QUERY_TYPE_TIMESTAMP:
1704 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1705 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1706 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1707 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1708 case VK_QUERY_TYPE_OCCLUSION:
1709 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1710 values += 1;
1711 break;
1712 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1713 values += util_bitcount(pool->vk.pipeline_statistics);
1714 break;
1715 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1716 values += 2;
1717 break;
1718 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1719 values += 1;
1720 break;
1721 default:
1722 unreachable("trying to get size of unhandled query type");
1723 }
1724 return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4);
1725 }
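/* Illustrative example: an occlusion query copied with VK_QUERY_RESULT_64_BIT |
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT occupies (1 + 1) * 8 = 16 bytes per query.
 */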
1726
1727 VKAPI_ATTR void VKAPI_CALL
1728 radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
1729 uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride,
1730 VkQueryResultFlags flags)
1731 {
1732 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1733 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1734 RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
1735 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1736 uint64_t va = radv_buffer_get_va(pool->bo);
1737 uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
1738 size_t dst_size = radv_query_result_size(pool, flags);
1739 dest_va += dst_buffer->offset + dstOffset;
1740
1741 if (!queryCount)
1742 return;
1743
1744 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
1745 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
1746
1747 /* Work around engines that forget to properly specify WAIT_BIT because some drivers implicitly
1748 * synchronize before the query copy.
1749 */
1750 if (cmd_buffer->device->instance->drirc.flush_before_query_copy)
1751 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1752
1753 /* From the Vulkan spec 1.1.108:
1754 *
1755 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1756 * previous uses of vkCmdResetQueryPool in the same queue, without any
1757 * additional synchronization."
1758 *
1759 * So, we have to flush the caches if the compute shader path was used.
1760 */
1761 emit_query_flush(cmd_buffer, pool);
1762
1763 switch (pool->vk.query_type) {
1764 case VK_QUERY_TYPE_OCCLUSION:
1765 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline, pool->bo,
1766 dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride,
1767 dst_size, queryCount, flags, 0, 0, false);
1768 break;
1769 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1770 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1771 const uint32_t task_invoc_offset =
1772 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1773 const unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
1774
1775 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1776 unsigned query = firstQuery + i;
1777
1778 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1779
1780 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1781
1782 /* This waits on the ME. All copies below are done on the ME */
1783 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1784
1785 if (pool->uses_ace && cmd_buffer->device->physical_device->emulate_mesh_shader_queries) {
1786 const uint64_t src_va = va + query * pool->stride;
1787 const uint64_t start_va = src_va + task_invoc_offset + 4;
1788 const uint64_t stop_va = start_va + pipelinestat_block_size;
1789
1790 radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
1791
1792 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, start_va, 0x80000000, 0xffffffff);
1793 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, stop_va, 0x80000000, 0xffffffff);
1794 }
1795 }
1796 }
1797 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, pool->bo,
1798 dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride,
1799 dst_size, queryCount, flags, pool->vk.pipeline_statistics,
1800 pool->availability_offset + 4 * firstQuery, false);
1801 break;
1802 case VK_QUERY_TYPE_TIMESTAMP:
1803 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1804 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1805 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1806 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1807 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1808 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1809 unsigned query = firstQuery + i;
1810 uint64_t local_src_va = va + query * pool->stride;
1811
1812 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1813
1814 /* Wait on the high 32 bits of the timestamp in
1815 * case the low part is 0xffffffff.
1816 */
1817 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4, TIMESTAMP_NOT_READY >> 32,
1818 0xffffffff);
1819 }
1820 }
1821
1822 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline, pool->bo,
1823 dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride,
1824 dst_size, queryCount, flags, 0, 0, false);
1825 break;
1826 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1827 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1828 for (unsigned i = 0; i < queryCount; i++) {
1829 unsigned query = firstQuery + i;
1830 uint64_t src_va = va + query * pool->stride;
1831
1832 radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1833
1834 /* Wait on the upper word of all results. */
1835 for (unsigned j = 0; j < 4; j++, src_va += 8) {
1836 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1837 }
1838 }
1839 }
1840
1841 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline, pool->bo, dst_buffer->bo,
1842 firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1843 queryCount, flags, 0, 0, false);
1844 break;
1845 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1846 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1847 const bool uses_gds_query = pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11;
1848
1849 for (unsigned i = 0; i < queryCount; i++) {
1850 unsigned query = firstQuery + i;
1851 uint64_t src_va = va + query * pool->stride;
1852
1853 radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1854
1855 /* Wait on the upper word of the PrimitiveStorageNeeded result. */
1856 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1857 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
1858
1859 if (uses_gds_query) {
1860 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 36, 0x80000000, 0xffffffff);
1861 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 44, 0x80000000, 0xffffffff);
1862 }
1863 }
1864 }
1865
1866 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pg_query_pipeline, pool->bo, dst_buffer->bo,
1867 firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, dst_size,
1868 queryCount, flags, 0, 0,
1869 pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11);
1870 break;
1871 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1872 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1873 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1874 for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1875 unsigned query = firstQuery + i;
1876
1877 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1878
1879 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1880
1881 /* This waits on the ME. All copies below are done on the ME */
1882 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1883 }
1884 }
1885 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
1886 pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset,
1887 pool->stride, stride, dst_size, queryCount, flags, 1 << 13,
1888 pool->availability_offset + 4 * firstQuery, false);
1889 } else {
1890 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1891 for (unsigned i = 0; i < queryCount; i++) {
1892 unsigned query = firstQuery + i;
1893 uint64_t src_va = va + query * pool->stride;
1894
1895 radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
1896
1897 /* Wait on the upper word. */
1898 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1899 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 12, 0x80000000, 0xffffffff);
1900 }
1901 }
1902
1903 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.ms_prim_gen_query_pipeline, pool->bo,
1904 dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride,
1905 stride, dst_size, queryCount, flags, 0, 0, false);
1906 }
1907 break;
1908 default:
1909 unreachable("trying to get results of unhandled query type");
1910 }
1911 }
1912
1913 static uint32_t
1914 query_clear_value(VkQueryType type)
1915 {
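/* Timestamp-like queries are cleared to TIMESTAMP_NOT_READY so that unwritten slots read back
 * as unavailable; all other query types are simply zeroed.
 */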
1916 switch (type) {
1917 case VK_QUERY_TYPE_TIMESTAMP:
1918 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1919 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1920 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1921 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1922 return (uint32_t)TIMESTAMP_NOT_READY;
1923 default:
1924 return 0;
1925 }
1926 }
1927
1928 VKAPI_ATTR void VKAPI_CALL
1929 radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
1930 {
1931 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1932 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1933 uint32_t value = query_clear_value(pool->vk.query_type);
1934 uint32_t flush_bits = 0;
1935
1936 /* Make sure to sync all previous work if the given command buffer has
1937 * pending active queries. Otherwise the GPU might write query data
1938 * after the reset operation.
1939 */
1940 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1941
1942 flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
1943 queryCount * pool->stride, value);
1944
1945 if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1946 (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT &&
1947 cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11)) {
1948 flush_bits |=
1949 radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1950 radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4, queryCount * 4, 0);
1951 }
1952
1953 if (flush_bits) {
1954 /* Only need to flush caches for the compute shader path. */
1955 cmd_buffer->pending_reset_query = true;
1956 cmd_buffer->state.flush_bits |= flush_bits;
1957 }
1958 }
1959
1960 VKAPI_ATTR void VKAPI_CALL
1961 radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
1962 {
1963 RADV_FROM_HANDLE(radv_device, device, _device);
1964 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1965
1966 uint32_t value = query_clear_value(pool->vk.query_type);
1967 uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
1968 uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1969
1970 for (uint32_t *p = data; p != data_end; ++p)
1971 *p = value;
1972
1973 if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
1974 (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT &&
1975 device->physical_device->rad_info.gfx_level >= GFX11)) {
1976 memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
1977 }
1978 }
1979
1980 static unsigned
1981 event_type_for_stream(unsigned stream)
1982 {
1983 switch (stream) {
1984 default:
1985 case 0:
1986 return V_028A90_SAMPLE_STREAMOUTSTATS;
1987 case 1:
1988 return V_028A90_SAMPLE_STREAMOUTSTATS1;
1989 case 2:
1990 return V_028A90_SAMPLE_STREAMOUTSTATS2;
1991 case 3:
1992 return V_028A90_SAMPLE_STREAMOUTSTATS3;
1993 }
1994 }
1995
1996 static void
1997 emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1998 {
1999 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2000
2001 radeon_check_space(cmd_buffer->device->ws, cs, 4);
2002
2003 assert(index < MAX_SO_STREAMS);
2004
2005 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2006 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
2007 radeon_emit(cs, va);
2008 radeon_emit(cs, va >> 32);
2009 }
2010
2011 static void
2012 gfx10_copy_gds_query(struct radeon_cmdbuf *cs, uint32_t gds_offset, uint64_t va)
2013 {
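/* Copy a 32-bit shader-query counter from GDS into the lower dword at va using a CP COPY_DATA
 * packet; callers mark the upper dword separately to flag the value as written.
 */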
2014 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2015 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
2016 radeon_emit(cs, gds_offset);
2017 radeon_emit(cs, 0);
2018 radeon_emit(cs, va);
2019 radeon_emit(cs, va >> 32);
2020 }
2021
2022 static void
2023 gfx10_copy_gds_query_gfx(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
2024 {
2025 /* Make sure GDS is idle before copying the value. */
2026 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
2027 radv_emit_cache_flush(cmd_buffer);
2028
2029 gfx10_copy_gds_query(cmd_buffer->cs, gds_offset, va);
2030 }
2031
2032 static void
2033 gfx10_copy_gds_query_ace(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
2034 {
2035 /* Make sure GDS is idle before copying the value. */
2036 cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
2037 radv_gang_cache_flush(cmd_buffer);
2038
2039 gfx10_copy_gds_query(cmd_buffer->gang.cs, gds_offset, va);
2040 }
2041
2042 static void
2043 radv_update_hw_pipelinestat(struct radv_cmd_buffer *cmd_buffer)
2044 {
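/* Start the HW pipeline-statistics counters when the first query becomes active and stop them
 * again once the last one ends; callers update the active query counts before calling this.
 */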
2045 const uint32_t num_pipeline_stat_queries = radv_get_num_pipeline_stat_queries(cmd_buffer);
2046
2047 if (num_pipeline_stat_queries == 0) {
2048 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
2049 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
2050 } else if (num_pipeline_stat_queries == 1) {
2051 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
2052 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
2053 }
2054 }
2055
2056 static void
2057 emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, VkQueryType query_type,
2058 VkQueryControlFlags flags, uint32_t index)
2059 {
2060 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2061 switch (query_type) {
2062 case VK_QUERY_TYPE_OCCLUSION:
2063 radeon_check_space(cmd_buffer->device->ws, cs, 11);
2064
2065 ++cmd_buffer->state.active_occlusion_queries;
2066 if (cmd_buffer->state.active_occlusion_queries == 1) {
2067 if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
2068 /* This is the first occlusion query; enable
2069 * the precise hint if the precision bit is set.
2070 */
2071 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
2072 }
2073
2074 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2075 } else {
2076 if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) && !cmd_buffer->state.perfect_occlusion_queries_enabled) {
2077 /* This is not the first query, but this one
2078 * needs precise results, so DB_COUNT_CONTROL
2079 * has to be updated accordingly.
2080 */
2081 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
2082
2083 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2084 }
2085 }
2086
2087 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 &&
2088 cmd_buffer->device->physical_device->rad_info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
2089 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
2090 } else {
2091 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2092 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2093 radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
2094 } else {
2095 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
2096 }
2097 }
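/* The begin ZPASS/pixel-pipe counters are written at the query's base VA; emit_end_query()
 * writes the matching end counters at va + 8.
 */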
2098 radeon_emit(cs, va);
2099 radeon_emit(cs, va >> 32);
2100 break;
2101 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
2102 radeon_check_space(cmd_buffer->device->ws, cs, 4);
2103
2104 ++cmd_buffer->state.active_pipeline_queries;
2105
2106 radv_update_hw_pipelinestat(cmd_buffer);
2107
2108 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
2109 uint32_t cs_invoc_offset =
2110 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
2111 va += cs_invoc_offset;
2112 }
2113
2114 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2115 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2116 radeon_emit(cs, va);
2117 radeon_emit(cs, va >> 32);
2118
2119 if (pool->uses_gds) {
2120 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
2121 uint32_t gs_prim_offset =
2122 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
2123
2124 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, va + gs_prim_offset);
2125 }
2126
2127 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) {
2128 uint32_t gs_invoc_offset =
2129 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT);
2130
2131 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_INVOCATION_OFFSET, va + gs_invoc_offset);
2132 }
2133
2134 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
2135 uint32_t mesh_invoc_offset =
2136 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
2137
2138 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
2139 }
2140
2141 /* Record that the command buffer needs GDS. */
2142 cmd_buffer->gds_needed = true;
2143
2144 if (!cmd_buffer->state.active_pipeline_gds_queries)
2145 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2146
2147 cmd_buffer->state.active_pipeline_gds_queries++;
2148 }
2149
2150 if (pool->uses_ace) {
2151 uint32_t task_invoc_offset =
2152 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
2153
2154 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2155 va += task_invoc_offset;
2156
2157 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 4);
2158
2159 radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2160 radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2161 radeon_emit(cmd_buffer->gang.cs, va);
2162 radeon_emit(cmd_buffer->gang.cs, va >> 32);
2163 } else {
2164 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 11);
2165
2166 gfx10_copy_gds_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
2167 radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
2168
2169 /* Record that the command buffer needs GDS. */
2170 cmd_buffer->gds_needed = true;
2171
2172 if (!cmd_buffer->state.active_pipeline_ace_queries)
2173 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2174
2175 cmd_buffer->state.active_pipeline_ace_queries++;
2176 }
2177 }
2178 break;
2179 }
2180 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2181 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
2182 /* generated prim counter */
2183 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
2184 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2185
2186 /* written prim counter */
2187 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 8);
2188 radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
2189
2190 /* Record that the command buffer needs GDS. */
2191 cmd_buffer->gds_needed = true;
2192
2193 if (!cmd_buffer->state.active_prims_xfb_gds_queries)
2194 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2195
2196 cmd_buffer->state.active_prims_xfb_gds_queries++;
2197 } else {
2198 cmd_buffer->state.active_prims_xfb_queries++;
2199
2200 radv_update_hw_pipelinestat(cmd_buffer);
2201
2202 emit_sample_streamout(cmd_buffer, va, index);
2203 }
2204 break;
2205 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
2206 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2207 /* On GFX11+, the primitives generated query always uses GDS. */
2208 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
2209 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2210
2211 /* Record that the command buffer needs GDS. */
2212 cmd_buffer->gds_needed = true;
2213
2214 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2215 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2216
2217 cmd_buffer->state.active_prims_gen_gds_queries++;
2218 } else {
2219 if (!cmd_buffer->state.active_prims_gen_queries) {
2220 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
2221
2222 cmd_buffer->state.active_prims_gen_queries++;
2223
2224 if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
2225 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
2226 }
2227 } else {
2228 cmd_buffer->state.active_prims_gen_queries++;
2229 }
2230
2231 radv_update_hw_pipelinestat(cmd_buffer);
2232
2233 if (pool->uses_gds) {
2234 /* generated prim counter */
2235 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 32);
2236 radv_cs_write_data_imm(cs, V_370_ME, va + 36, 0x80000000);
2237
2238 /* Record that the command buffer needs GDS. */
2239 cmd_buffer->gds_needed = true;
2240
2241 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2242 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2243
2244 cmd_buffer->state.active_prims_gen_gds_queries++;
2245 }
2246
2247 emit_sample_streamout(cmd_buffer, va, index);
2248 }
2249 break;
2250 }
2251 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2252 radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
2253 break;
2254 }
2255 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
2256 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2257 radeon_check_space(cmd_buffer->device->ws, cs, 4);
2258
2259 ++cmd_buffer->state.active_pipeline_queries;
2260
2261 radv_update_hw_pipelinestat(cmd_buffer);
2262
2263 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2264 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2265 radeon_emit(cs, va);
2266 radeon_emit(cs, va >> 32);
2267 } else {
2268 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va);
2269 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
2270
2271 /* Record that the command buffer needs GDS. */
2272 cmd_buffer->gds_needed = true;
2273
2274 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2275 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2276
2277 cmd_buffer->state.active_prims_gen_gds_queries++;
2278 }
2279 break;
2280 }
2281 default:
2282 unreachable("beginning unhandled query type");
2283 }
2284 }
2285
2286 static void
2287 emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint64_t avail_va,
2288 VkQueryType query_type, uint32_t index)
2289 {
2290 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2291 switch (query_type) {
2292 case VK_QUERY_TYPE_OCCLUSION:
2293 radeon_check_space(cmd_buffer->device->ws, cs, 14);
2294
2295 cmd_buffer->state.active_occlusion_queries--;
2296 if (cmd_buffer->state.active_occlusion_queries == 0) {
2297 /* Reset the perfect occlusion queries hint now that no
2298 * queries are active.
2299 */
2300 cmd_buffer->state.perfect_occlusion_queries_enabled = false;
2301
2302 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
2303 }
2304
2305 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11 &&
2306 cmd_buffer->device->physical_device->rad_info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
2307 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
2308 } else {
2309 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2310 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2311 radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
2312 } else {
2313 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
2314 }
2315 }
2316 radeon_emit(cs, va + 8);
2317 radeon_emit(cs, (va + 8) >> 32);
2318
2319 break;
2320 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
2321 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
2322
2323 radeon_check_space(cmd_buffer->device->ws, cs, 16);
2324
2325 cmd_buffer->state.active_pipeline_queries--;
2326
2327 radv_update_hw_pipelinestat(cmd_buffer);
2328
2329 va += pipelinestat_block_size;
2330
2331 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
2332 uint32_t cs_invoc_offset =
2333 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
2334 va += cs_invoc_offset;
2335 }
2336
2337 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2338 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2339 radeon_emit(cs, va);
2340 radeon_emit(cs, va >> 32);
2341
2342 if (pool->uses_gds) {
2343 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
2344 uint32_t gs_prim_offset =
2345 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
2346
2347 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, va + gs_prim_offset);
2348 }
2349
2350 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT) {
2351 uint32_t gs_invoc_offset =
2352 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT);
2353
2354 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_GS_INVOCATION_OFFSET, va + gs_invoc_offset);
2355 }
2356
2357 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
2358 uint32_t mesh_invoc_offset =
2359 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
2360
2361 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
2362 }
2363
2364 cmd_buffer->state.active_pipeline_gds_queries--;
2365
2366 if (!cmd_buffer->state.active_pipeline_gds_queries)
2367 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2368 }
2369
2370 if (pool->uses_ace) {
2371 uint32_t task_invoc_offset =
2372 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
2373
2374 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2375 va += task_invoc_offset;
2376
2377 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 4);
2378
2379 radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2380 radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2381 radeon_emit(cmd_buffer->gang.cs, va);
2382 radeon_emit(cmd_buffer->gang.cs, va >> 32);
2383 } else {
2384 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 11);
2385
2386 gfx10_copy_gds_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
2387 radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
2388
2389 cmd_buffer->state.active_pipeline_ace_queries--;
2390
2391 if (!cmd_buffer->state.active_pipeline_ace_queries)
2392 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2393 }
2394 }
2395
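/* Mark the query as available (value 1) with a bottom-of-pipe EOP event once all of the
 * statistic writes above have completed.
 */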
2396 radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
2397 V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va,
2398 1, cmd_buffer->gfx9_eop_bug_va);
2399 break;
2400 }
2401 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2402 if (cmd_buffer->device->physical_device->use_ngg_streamout) {
2403 /* generated prim counter */
2404 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
2405 radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
2406
2407 /* written prim counter */
2408 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 24);
2409 radv_cs_write_data_imm(cs, V_370_ME, va + 28, 0x80000000);
2410
2411 cmd_buffer->state.active_prims_xfb_gds_queries--;
2412
2413 if (!cmd_buffer->state.active_prims_xfb_gds_queries)
2414 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2415 } else {
2416 cmd_buffer->state.active_prims_xfb_queries--;
2417
2418 radv_update_hw_pipelinestat(cmd_buffer);
2419
2420 emit_sample_streamout(cmd_buffer, va + 16, index);
2421 }
2422 break;
2423 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
2424 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2425 /* On GFX11+, the primitives generated query always uses GDS. */
2426 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
2427 radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
2428
2429 cmd_buffer->state.active_prims_gen_gds_queries--;
2430
2431 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2432 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2433 } else {
2434 if (cmd_buffer->state.active_prims_gen_queries == 1) {
2435 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
2436
2437 cmd_buffer->state.active_prims_gen_queries--;
2438
2439 if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
2440 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
2441 }
2442 } else {
2443 cmd_buffer->state.active_prims_gen_queries--;
2444 }
2445
2446 radv_update_hw_pipelinestat(cmd_buffer);
2447
2448 if (pool->uses_gds) {
2449 /* generated prim counter */
2450 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 40);
2451 radv_cs_write_data_imm(cs, V_370_ME, va + 44, 0x80000000);
2452
2453 cmd_buffer->state.active_prims_gen_gds_queries--;
2454
2455 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2456 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2457 }
2458
2459 emit_sample_streamout(cmd_buffer, va + 16, index);
2460 }
2461 break;
2462 }
2463 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2464 radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
2465 break;
2466 }
2467 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
2468 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
2469 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
2470
2471 radeon_check_space(cmd_buffer->device->ws, cs, 16);
2472
2473 cmd_buffer->state.active_pipeline_queries--;
2474
2475 radv_update_hw_pipelinestat(cmd_buffer);
2476
2477 va += pipelinestat_block_size;
2478
2479 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
2480 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
2481 radeon_emit(cs, va);
2482 radeon_emit(cs, va >> 32);
2483
2484 radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
2485 V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT,
2486 avail_va, 1, cmd_buffer->gfx9_eop_bug_va);
2487 } else {
2488 gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va + 8);
2489 radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
2490
2491 cmd_buffer->state.active_prims_gen_gds_queries--;
2492
2493 if (!cmd_buffer->state.active_prims_gen_gds_queries)
2494 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
2495 }
2496 break;
2497 }
2498 default:
2499 unreachable("ending unhandled query type");
2500 }
2501
2502 cmd_buffer->active_query_flush_bits |=
2503 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
2504 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
2505 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2506 }
2507 }
2508
2509 VKAPI_ATTR void VKAPI_CALL
2510 radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
2511 VkQueryControlFlags flags, uint32_t index)
2512 {
2513 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2514 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2515 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2516 uint64_t va = radv_buffer_get_va(pool->bo);
2517
2518 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2519
2520 emit_query_flush(cmd_buffer, pool);
2521
2522 va += pool->stride * query;
2523
2524 if (pool->uses_ace) {
2525 if (!radv_gang_init(cmd_buffer))
2526 return;
2527
2528 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->gang.cs, pool->bo);
2529 }
2530
2531 emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, flags, index);
2532 }
2533
2534 VKAPI_ATTR void VKAPI_CALL
2535 radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, uint32_t index)
2536 {
2537 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2538 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2539 uint64_t va = radv_buffer_get_va(pool->bo);
2540 uint64_t avail_va = va + pool->availability_offset + 4 * query;
2541 va += pool->stride * query;
2542
2543 /* We do not need to add the pool BO to the list because the query must
2544 * currently be active, which means the BO is already in the list.
2545 */
2546 emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, index);
2547
2548 /*
2549 * For multiview we have to emit a query for each bit in the mask;
2550 * however, the first query we emit gets the totals for all the
2551 * operations, so we don't want real values in the other queries.
2552 * Emit a fake begin/end sequence so the waiting code sees a
2553 * completed query value and doesn't hang, while those extra
2554 * queries simply return 0.
2555 */
2556 if (cmd_buffer->state.render.view_mask) {
2557 for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.render.view_mask); i++) {
2558 va += pool->stride;
2559 avail_va += 4;
2560 emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, 0, 0);
2561 emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, 0);
2562 }
2563 }
2564 }
2565
2566 void
2567 radv_write_timestamp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkPipelineStageFlags2 stage)
2568 {
2569 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2570
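/* TOP_OF_PIPE timestamps copy the current GPU clock immediately with COPY_DATA; every other
 * stage uses a bottom-of-pipe EOP event so the timestamp is written after prior work completes.
 */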
2571 if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
2572 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2573 radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM | COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
2574 COPY_DATA_DST_SEL(V_370_MEM));
2575 radeon_emit(cs, 0);
2576 radeon_emit(cs, 0);
2577 radeon_emit(cs, va);
2578 radeon_emit(cs, va >> 32);
2579 } else {
2580 radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
2581 V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_TIMESTAMP, va, 0,
2582 cmd_buffer->gfx9_eop_bug_va);
2583 }
2584 }
2585
2586 VKAPI_ATTR void VKAPI_CALL
2587 radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkQueryPool queryPool,
2588 uint32_t query)
2589 {
2590 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2591 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2592 const unsigned num_queries = MAX2(util_bitcount(cmd_buffer->state.render.view_mask), 1);
2593 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2594 const uint64_t va = radv_buffer_get_va(pool->bo);
2595 uint64_t query_va = va + pool->stride * query;
2596
2597 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2598
2599 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
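/* On the transfer (SDMA) queue, timestamps are written with the SDMA global timestamp packet
 * instead of CP packets.
 */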
2600 if (cmd_buffer->device->instance->drirc.flush_before_timestamp_write) {
2601 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 1);
2602 radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
2603 }
2604
2605 for (unsigned i = 0; i < num_queries; ++i, query_va += pool->stride) {
2606 radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 3);
2607 radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
2608 radeon_emit(cs, query_va);
2609 radeon_emit(cs, query_va >> 32);
2610 }
2611 return;
2612 }
2613
2614 if (cmd_buffer->device->instance->drirc.flush_before_timestamp_write) {
2615 /* Make sure previously launched waves have finished */
2616 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
2617 }
2618
2619 radv_emit_cache_flush(cmd_buffer);
2620
2621 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
2622
2623 for (unsigned i = 0; i < num_queries; i++) {
2624 radv_write_timestamp(cmd_buffer, query_va, stage);
2625 query_va += pool->stride;
2626 }
2627
2628 cmd_buffer->active_query_flush_bits |=
2629 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
2630 if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
2631 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2632 }
2633
2634 assert(cmd_buffer->cs->cdw <= cdw_max);
2635 }
2636
2637 VKAPI_ATTR void VKAPI_CALL
2638 radv_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
2639 const VkAccelerationStructureKHR *pAccelerationStructures,
2640 VkQueryType queryType, VkQueryPool queryPool, uint32_t firstQuery)
2641 {
2642 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2643 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2644 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2645 uint64_t pool_va = radv_buffer_get_va(pool->bo);
2646 uint64_t query_va = pool_va + pool->stride * firstQuery;
2647
2648 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2649
2650 radv_emit_cache_flush(cmd_buffer);
2651
2652 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 6 * accelerationStructureCount);
2653
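/* Each requested property is copied directly from the corresponding field of the acceleration
 * structure header into the query pool slot with a CP COPY_DATA packet.
 */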
2654 for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
2655 RADV_FROM_HANDLE(vk_acceleration_structure, accel_struct, pAccelerationStructures[i]);
2656 uint64_t va = vk_acceleration_structure_get_va(accel_struct);
2657
2658 switch (queryType) {
2659 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2660 va += offsetof(struct radv_accel_struct_header, compacted_size);
2661 break;
2662 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2663 va += offsetof(struct radv_accel_struct_header, serialization_size);
2664 break;
2665 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2666 va += offsetof(struct radv_accel_struct_header, instance_count);
2667 break;
2668 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2669 va += offsetof(struct radv_accel_struct_header, size);
2670 break;
2671 default:
2672 unreachable("Unhandle accel struct query type.");
2673 }
2674
2675 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2676 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
2677 COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
2678 radeon_emit(cs, va);
2679 radeon_emit(cs, va >> 32);
2680 radeon_emit(cs, query_va);
2681 radeon_emit(cs, query_va >> 32);
2682
2683 query_va += pool->stride;
2684 }
2685
2686 assert(cmd_buffer->cs->cdw <= cdw_max);
2687 }
2688