1 /*
2 * Copyright 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * SPDX-License-Identifier: MIT
7 */
8
9 #include <assert.h>
10 #include <fcntl.h>
11 #include <stdbool.h>
12 #include <string.h>
13
14 #include "bvh/bvh.h"
15 #include "meta/radv_meta.h"
16 #include "nir/nir_builder.h"
17 #include "util/u_atomic.h"
18 #include "vulkan/vulkan_core.h"
19 #include "radv_cs.h"
20 #include "radv_entrypoints.h"
21 #include "radv_perfcounter.h"
22 #include "radv_query.h"
23 #include "radv_rmv.h"
24 #include "sid.h"
25 #include "vk_acceleration_structure.h"
26 #include "vk_common_entrypoints.h"
27 #include "vk_shader_module.h"
28
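/* Sentinel stored in timestamp query slots until the GPU writes the real
 * value; the copy shader below treats any other value as "available".
 */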
29 #define TIMESTAMP_NOT_READY UINT64_MAX
30
31 static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkQueryType query_type,
32 struct radeon_winsys_bo *src_bo, struct radeon_winsys_bo *dst_bo, uint64_t src_offset,
33 uint64_t dst_offset, uint32_t src_stride, uint32_t dst_stride, size_t dst_size,
34 uint32_t count, uint32_t flags, uint32_t pipeline_stats_mask, uint32_t avail_offset,
35 bool uses_emulated_queries);
36
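/* Emits a COPY_DATA packet copying a single dword from GDS (or memory) to
 * memory with write confirmation; used to sample the emulated shader query
 * counters.
 */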
37 static void
38 gfx10_copy_shader_query(struct radeon_cmdbuf *cs, uint32_t src_sel, uint64_t src_va, uint64_t dst_va)
39 {
40 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
41 radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
42 radeon_emit(cs, src_va);
43 radeon_emit(cs, src_va >> 32);
44 radeon_emit(cs, dst_va);
45 radeon_emit(cs, dst_va >> 32);
46 }
47
48 static void
49 gfx10_copy_shader_query_gfx(struct radv_cmd_buffer *cmd_buffer, bool use_gds, uint32_t src_offset, uint64_t dst_va)
50 {
51 uint32_t src_sel;
52 uint64_t src_va;
53
54 if (use_gds) {
55 /* Make sure GDS is idle before copying the value. */
56 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
57
58 src_sel = COPY_DATA_GDS;
59 src_va = src_offset;
60 } else {
61 /* Make sure GE is idle before copying the value. */
62 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
63
64 src_sel = COPY_DATA_SRC_MEM;
65 src_va = cmd_buffer->state.shader_query_buf_va + src_offset - RADV_SHADER_QUERY_PRIM_GEN_OFFSET(0);
66 }
67
68 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2;
69 radv_emit_cache_flush(cmd_buffer);
70
71 gfx10_copy_shader_query(cmd_buffer->cs, src_sel, src_va, dst_va);
72 }
73
74 static void
75 gfx10_copy_shader_query_ace(struct radv_cmd_buffer *cmd_buffer, uint32_t src_offset, uint64_t src_va)
76 {
77 /* Make sure GDS is idle before copying the value. */
78 cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
79 radv_gang_cache_flush(cmd_buffer);
80
81 gfx10_copy_shader_query(cmd_buffer->gang.cs, COPY_DATA_GDS, src_offset, src_va);
82 }
83
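/* Stores the 32-bit availability value at the given offset, zero-extending
 * it to 64 bits when VK_QUERY_RESULT_64_BIT is requested.
 */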
84 static void
85 radv_store_availability(nir_builder *b, nir_def *flags, nir_def *dst_buf, nir_def *offset, nir_def *value32)
86 {
87 nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
88
89 nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
90
91 nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);
92
93 nir_push_else(b, NULL);
94
95 nir_store_ssbo(b, value32, dst_buf, offset);
96
97 nir_pop_if(b, NULL);
98
99 nir_pop_if(b, NULL);
100 }
101
102 static size_t
103 radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags)
104 {
105 unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0;
106 switch (pool->vk.query_type) {
107 case VK_QUERY_TYPE_TIMESTAMP:
108 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
109 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
110 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
111 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
112 case VK_QUERY_TYPE_OCCLUSION:
113 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
114 values += 1;
115 break;
116 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
117 values += util_bitcount(pool->vk.pipeline_statistics);
118 break;
119 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
120 values += 2;
121 break;
122 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
123 values += 1;
124 break;
125 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
126 values += 1;
127 break;
128 default:
129 unreachable("trying to get size of unhandled query type");
130 }
131 return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4);
132 }
133
134 /**
135 * Occlusion query
136 */
137 static bool
138 radv_occlusion_query_use_l2(const struct radv_physical_device *pdev)
139 {
140 /* Occlusion query writes don't go through L2 on GFX6-8 which means the driver would need to
141 * flush caches before every read in shaders or use MTYPE=3 (ie. uncached) in the buffer
142 * descriptor to bypass L2. Use the WAIT_REG_MEM logic instead which is easier to implement.
143 */
144 return pdev->info.gfx_level >= GFX9;
145 }
146
147 static nir_shader *
148 build_occlusion_query_shader(struct radv_device *device)
149 {
150 /* the shader this builds is roughly
151 *
152 * push constants {
153 * uint32_t flags;
154 * uint32_t dst_stride;
155 * };
156 *
157 * uint32_t src_stride = 16 * db_count;
158 *
159 * location(binding = 0) buffer dst_buf;
160 * location(binding = 1) buffer src_buf;
161 *
162 * void main() {
163 * uint64_t result = 0;
164 * uint64_t src_offset = src_stride * global_id.x;
165 * uint64_t dst_offset = dst_stride * global_id.x;
166 * bool available = true;
167 * for (int i = 0; i < db_count; ++i) {
168 * if (enabled_rb_mask & BITFIELD64_BIT(i)) {
169 * uint64_t start = src_buf[src_offset + 16 * i];
170 * uint64_t end = src_buf[src_offset + 16 * i + 8];
171 * if ((start & (1ull << 63)) && (end & (1ull << 63)))
172 * result += end - start;
173 * else
174 * available = false;
175 * }
176 * }
177 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
178 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
179 * if (flags & VK_QUERY_RESULT_64_BIT)
180 * dst_buf[dst_offset] = result;
181 * else
182 * dst_buf[dst_offset] = (uint32_t)result;
183 * }
184 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
185 * dst_buf[dst_offset + elem_size] = available;
186 * }
187 * }
188 */
189 const struct radv_physical_device *pdev = radv_device_physical(device);
190 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
191 b.shader->info.workgroup_size[0] = 64;
192
193 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
194 nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
195 nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
196 nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
197 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
198 uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
199 unsigned db_count = pdev->info.max_render_backends;
200
201 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
202
203 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
204 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
205
206 nir_def *global_id = get_global_ids(&b, 1);
207
208 nir_def *input_stride = nir_imm_int(&b, db_count * 16);
209 nir_def *input_base = nir_imul(&b, input_stride, global_id);
210 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
211 nir_def *output_base = nir_imul(&b, output_stride, global_id);
212
213 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
214 nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
215 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
216
217 if (radv_occlusion_query_use_l2(pdev)) {
218 nir_def *query_result_wait = nir_test_mask(&b, flags, VK_QUERY_RESULT_WAIT_BIT);
219 nir_push_if(&b, query_result_wait);
220 {
221 /* Wait on the upper word of the last DB entry. */
222 nir_push_loop(&b);
223 {
224 const uint32_t rb_avail_offset = 16 * util_last_bit64(enabled_rb_mask) - 4;
225
226 /* Prevent the SSBO load from being moved out of the loop. */
227 nir_scoped_memory_barrier(&b, SCOPE_INVOCATION, NIR_MEMORY_ACQUIRE, nir_var_mem_ssbo);
228
229 nir_def *load_offset = nir_iadd_imm(&b, input_base, rb_avail_offset);
230 nir_def *load = nir_load_ssbo(&b, 1, 32, src_buf, load_offset, .align_mul = 4, .access = ACCESS_COHERENT);
231
232 nir_push_if(&b, nir_ige_imm(&b, load, 0x80000000));
233 {
234 nir_jump(&b, nir_jump_break);
235 }
236 nir_pop_if(&b, NULL);
237 }
238 nir_pop_loop(&b, NULL);
239 }
240 nir_pop_if(&b, NULL);
241 }
242
243 nir_push_loop(&b);
244
245 nir_def *current_outer_count = nir_load_var(&b, outer_counter);
246 radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
247
248 nir_def *enabled_cond = nir_iand_imm(&b, nir_ishl(&b, nir_imm_int64(&b, 1), current_outer_count), enabled_rb_mask);
249
250 nir_push_if(&b, nir_i2b(&b, enabled_cond));
251
252 nir_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
253 load_offset = nir_iadd(&b, input_base, load_offset);
254
255 nir_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);
256
257 nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
258 nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);
259
260 nir_def *start_done = nir_ilt_imm(&b, nir_load_var(&b, start), 0);
261 nir_def *end_done = nir_ilt_imm(&b, nir_load_var(&b, end), 0);
262
263 nir_push_if(&b, nir_iand(&b, start_done, end_done));
264
265 nir_store_var(&b, result,
266 nir_iadd(&b, nir_load_var(&b, result), nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
267 0x1);
268
269 nir_push_else(&b, NULL);
270
271 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
272
273 nir_pop_if(&b, NULL);
274 nir_pop_if(&b, NULL);
275 nir_pop_loop(&b, NULL);
276
277 /* Store the result if complete or if partial results have been requested. */
278
279 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
280 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
281 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
282
283 nir_push_if(&b, result_is_64bit);
284
285 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);
286
287 nir_push_else(&b, NULL);
288
289 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base, .align_mul = 8);
290
291 nir_pop_if(&b, NULL);
292 nir_pop_if(&b, NULL);
293
294 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
295 nir_b2i32(&b, nir_load_var(&b, available)));
296
297 return b.shader;
298 }
299
300 static void
301 radv_begin_occlusion_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkQueryControlFlags flags)
302 {
303 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
304 const struct radv_physical_device *pdev = radv_device_physical(device);
305 struct radeon_cmdbuf *cs = cmd_buffer->cs;
306
307 radeon_check_space(device->ws, cs, 11);
308
309 ++cmd_buffer->state.active_occlusion_queries;
310 if (cmd_buffer->state.active_occlusion_queries == 1) {
311 if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
312 /* This is the first occlusion query, enable
313 * the hint if the precision bit is set.
314 */
315 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
316 }
317
318 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
319 } else {
320 if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) && !cmd_buffer->state.perfect_occlusion_queries_enabled) {
321 /* This is not the first query, but this one
322 * needs to enable precision, so DB_COUNT_CONTROL
323 * has to be updated accordingly.
324 */
325 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
326
327 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
328 }
329 }
330
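/* GFX11 uses PIXEL_PIPE_STAT_DUMP instead of ZPASS_DONE; sufficiently new PFP
 * firmware also provides a dedicated EVENT_WRITE_ZPASS packet that takes just
 * the destination address.
 */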
331 if (pdev->info.gfx_level >= GFX11 && pdev->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
332 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
333 } else {
334 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
335 if (pdev->info.gfx_level >= GFX11) {
336 radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
337 } else {
338 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
339 }
340 }
341 radeon_emit(cs, va);
342 radeon_emit(cs, va >> 32);
343 }
344
345 static void
346 radv_end_occlusion_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
347 {
348 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
349 const struct radv_physical_device *pdev = radv_device_physical(device);
350 struct radeon_cmdbuf *cs = cmd_buffer->cs;
351
352 radeon_check_space(device->ws, cs, 14);
353
354 cmd_buffer->state.active_occlusion_queries--;
355 if (cmd_buffer->state.active_occlusion_queries == 0) {
356 /* Reset the perfect occlusion queries hint now that no
357 * queries are active.
358 */
359 cmd_buffer->state.perfect_occlusion_queries_enabled = false;
360
361 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
362 }
363
364 if (pdev->info.gfx_level >= GFX11 && pdev->info.pfp_fw_version >= EVENT_WRITE_ZPASS_PFP_VERSION) {
365 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_ZPASS, 1, 0));
366 } else {
367 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
368 if (pdev->info.gfx_level >= GFX11) {
369 radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
370 } else {
371 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
372 }
373 }
374 radeon_emit(cs, va + 8);
375 radeon_emit(cs, (va + 8) >> 32);
376 }
377
378 static void
379 radv_copy_occlusion_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
380 uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset,
381 uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
382 {
383 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
384 const struct radv_physical_device *pdev = radv_device_physical(device);
385 struct radeon_cmdbuf *cs = cmd_buffer->cs;
386 uint64_t va = radv_buffer_get_va(pool->bo);
387
388 if (!radv_occlusion_query_use_l2(pdev)) {
389 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
390 uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
391 uint32_t rb_avail_offset = 16 * util_last_bit64(enabled_rb_mask) - 4;
392 for (unsigned i = 0; i < query_count; ++i) {
393 unsigned query = first_query + i;
394 uint64_t src_va = va + query * pool->stride + rb_avail_offset;
395
396 radeon_check_space(device->ws, cs, 7);
397
398 /* Waits on the upper word of the last DB entry */
399 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va, 0x80000000, 0xffffffff);
400 }
401 }
402 }
403
404 radv_query_shader(cmd_buffer, VK_QUERY_TYPE_OCCLUSION, pool->bo, dst_bo, first_query * pool->stride, dst_offset,
405 pool->stride, stride, dst_size, query_count, flags, 0, 0, false);
406 }
407
408 /**
409 * Pipeline stat query
410 */
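/* Maps each VkQueryPipelineStatisticFlagBits bit index (via ffs()) to the
 * slot order the hardware uses when dumping SAMPLE_PIPELINESTAT results.
 */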
411 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10, 13, 11, 12};
412
413 static unsigned
414 radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query)
415 {
416 uint32_t idx = ffs(query) - 1;
417 return pipeline_statistics_indices[idx] * 8;
418 }
419
420 static unsigned
421 radv_get_pipelinestat_query_size(struct radv_device *device)
422 {
423 /* GFX10_3 only has 11 valid pipeline statistics queries but in order to emulate mesh/task shader
424 * invocations, it's easier to use the same size as GFX11.
425 */
426 const struct radv_physical_device *pdev = radv_device_physical(device);
427 unsigned num_results = pdev->info.gfx_level >= GFX10_3 ? 14 : 11;
428 return num_results * 8;
429 }
430
431 static nir_shader *
432 build_pipeline_statistics_query_shader(struct radv_device *device)
433 {
434 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
435
436 /* the shader this builds is roughly
437 *
438 * push constants {
439 * uint32_t flags;
440 * uint32_t dst_stride;
441 * uint32_t stats_mask;
442 * uint32_t avail_offset;
443 * };
444 *
445 * uint32_t src_stride = pipelinestat_block_size * 2;
446 *
447 * location(binding = 0) buffer dst_buf;
448 * location(binding = 1) buffer src_buf;
449 *
450 * void main() {
451 * uint64_t src_offset = src_stride * global_id.x;
452 * uint64_t dst_base = dst_stride * global_id.x;
453 * uint64_t dst_offset = dst_base;
454 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
455 * uint32_t elem_count = stats_mask >> 16;
456 * uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
457 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
458 * dst_buf[dst_offset + elem_count * elem_size] = available32;
459 * }
460 * if ((bool)available32) {
461 * // repeat 11 times:
462 * if (stats_mask & (1 << 0)) {
463 * uint64_t start = src_buf[src_offset + 8 * indices[0]];
464 * uint64_t end = src_buf[src_offset + 8 * indices[0] +
465 * pipelinestat_block_size]; uint64_t result = end - start; if (flags & VK_QUERY_RESULT_64_BIT)
466 * dst_buf[dst_offset] = result;
467 * else
468 * dst_buf[dst_offset] = (uint32_t)result;
469 * dst_offset += elem_size;
470 * }
471 * } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
472 * // Set everything to 0 as we don't know what is valid.
473 * for (int i = 0; i < elem_count; ++i)
474 * dst_buf[dst_base + elem_size * i] = 0;
475 * }
476 * }
477 */
478 const struct radv_physical_device *pdev = radv_device_physical(device);
479 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query");
480 b.shader->info.workgroup_size[0] = 64;
481
482 nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
483 nir_variable *result = nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
484 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
485
486 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
487 nir_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
488 nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
489 nir_def *uses_emulated_queries = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
490
491 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
492 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
493
494 nir_def *global_id = get_global_ids(&b, 1);
495
496 nir_def *input_stride =
497 nir_bcsel(&b, nir_ine_imm(&b, uses_emulated_queries, 0), nir_imm_int(&b, pipelinestat_block_size * 2 + 8 * 2),
498 nir_imm_int(&b, pipelinestat_block_size * 2));
499 nir_def *input_base = nir_imul(&b, input_stride, global_id);
500
501 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
502 nir_def *output_base = nir_imul(&b, output_stride, global_id);
503
504 avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
505
506 nir_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
507 nir_store_var(&b, available, nir_i2b(&b, available32), 0x1);
508
509 if (pdev->emulate_mesh_shader_queries) {
510 nir_push_if(&b, nir_test_mask(&b, stats_mask, VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT));
511 {
512 const uint32_t idx = ffs(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT) - 1;
513
514 nir_def *avail_start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + 4);
515 nir_def *avail_start = nir_load_ssbo(&b, 1, 32, src_buf, avail_start_offset);
516
517 nir_def *avail_end_offset =
518 nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + pipelinestat_block_size + 4);
519 nir_def *avail_end = nir_load_ssbo(&b, 1, 32, src_buf, avail_end_offset);
520
521 nir_def *task_invoc_result_available =
522 nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avail_start, avail_end), 0x80000000));
523
524 nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), task_invoc_result_available), 0x1);
525 }
526 nir_pop_if(&b, NULL);
527 }
528
529 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
530 nir_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
531 nir_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
532
533 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
534 nir_b2i32(&b, nir_load_var(&b, available)));
535
536 nir_push_if(&b, nir_load_var(&b, available));
537
538 nir_store_var(&b, output_offset, output_base, 0x1);
539 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
540 nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i)));
541
542 nir_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8);
543 nir_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset);
544
545 nir_def *end_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size);
546 nir_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset);
547
548 nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
549
550 nir_push_if(&b,
551 nir_iand(&b, nir_i2b(&b, uses_emulated_queries),
552 nir_imm_bool(&b, 1u << i == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)));
553 {
554 /* Compute the emulated result if needed. */
555 nir_def *emu_start_offset = nir_iadd_imm(&b, input_base, pipelinestat_block_size * 2);
556 nir_def *emu_start = nir_load_ssbo(&b, 1, 64, src_buf, emu_start_offset);
557
558 nir_def *emu_end_offset = nir_iadd_imm(&b, input_base, pipelinestat_block_size * 2 + 8);
559 nir_def *emu_end = nir_load_ssbo(&b, 1, 64, src_buf, emu_end_offset);
560
561 nir_def *ngg_emu_result = nir_isub(&b, emu_end, emu_start);
562
563 nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_emu_result), 0x1);
564 }
565 nir_pop_if(&b, NULL);
566
567 /* Store result */
568 nir_push_if(&b, result_is_64bit);
569
570 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset));
571
572 nir_push_else(&b, NULL);
573
574 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset));
575
576 nir_pop_if(&b, NULL);
577
578 nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size), 0x1);
579
580 nir_pop_if(&b, NULL);
581 }
582
583 nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
584
585 nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
586
587 /* Stores zeros in all outputs. */
588
589 nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
590 nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
591
592 nir_loop *loop = nir_push_loop(&b);
593
594 nir_def *current_counter = nir_load_var(&b, counter);
595 radv_break_on_count(&b, counter, elem_count);
596
597 nir_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter));
598 nir_push_if(&b, result_is_64bit);
599
600 nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem);
601
602 nir_push_else(&b, NULL);
603
604 nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem);
605
606 nir_pop_if(&b, NULL);
607
608 nir_pop_loop(&b, loop);
609 nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
610 nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
611 return b.shader;
612 }
613
614 static void
615 radv_update_hw_pipelinestat(struct radv_cmd_buffer *cmd_buffer)
616 {
617 const uint32_t num_pipeline_stat_queries = radv_get_num_pipeline_stat_queries(cmd_buffer);
618
619 if (num_pipeline_stat_queries == 0) {
620 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
621 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
622 } else if (num_pipeline_stat_queries == 1) {
623 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
624 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
625 }
626 }
627
628 static void
629 radv_begin_pipeline_stat_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va)
630 {
631 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
632 const struct radv_physical_device *pdev = radv_device_physical(device);
633 struct radeon_cmdbuf *cs = cmd_buffer->cs;
634
635 radeon_check_space(device->ws, cs, 4);
636
637 ++cmd_buffer->state.active_pipeline_queries;
638
639 radv_update_hw_pipelinestat(cmd_buffer);
640
641 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
642 uint32_t cs_invoc_offset =
643 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
644 va += cs_invoc_offset;
645 }
646
647 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
648 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
649 radeon_emit(cs, va);
650 radeon_emit(cs, va >> 32);
651
652 if (pool->uses_emulated_queries) {
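/* The emulated NGG GS-prim counter pair lives right after the two HW
 * pipeline-stat blocks; the copy shader reads it at pipelinestat_block_size * 2.
 */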
653 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
654 const unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
655 const uint64_t prim_va = va + pipelinestat_block_size * 2;
656
657 gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, prim_va);
658 }
659
660 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
661 uint32_t mesh_invoc_offset =
662 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
663
664 gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
665 }
666
667 /* Record that the command buffer needs GDS. */
668 cmd_buffer->gds_needed = true;
669
670 if (!cmd_buffer->state.active_emulated_pipeline_queries)
671 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
672
673 cmd_buffer->state.active_emulated_pipeline_queries++;
674 }
675
676 if (pool->uses_ace) {
677 uint32_t task_invoc_offset =
678 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
679
680 if (pdev->info.gfx_level >= GFX11) {
681 va += task_invoc_offset;
682
683 radeon_check_space(device->ws, cmd_buffer->gang.cs, 4);
684
685 radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
686 radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
687 radeon_emit(cmd_buffer->gang.cs, va);
688 radeon_emit(cmd_buffer->gang.cs, va >> 32);
689 } else {
690 radeon_check_space(device->ws, cmd_buffer->gang.cs, 11);
691
692 gfx10_copy_shader_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
693 radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
694
695 /* Record that the command buffer needs GDS. */
696 cmd_buffer->gds_needed = true;
697
698 if (!cmd_buffer->state.active_pipeline_ace_queries)
699 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
700
701 cmd_buffer->state.active_pipeline_ace_queries++;
702 }
703 }
704 }
705
706 static void
707 radv_end_pipeline_stat_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
708 uint64_t avail_va)
709 {
710 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
711 const struct radv_physical_device *pdev = radv_device_physical(device);
712 struct radeon_cmdbuf *cs = cmd_buffer->cs;
713
714 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
715
716 radeon_check_space(device->ws, cs, 16);
717
718 cmd_buffer->state.active_pipeline_queries--;
719
720 radv_update_hw_pipelinestat(cmd_buffer);
721
722 va += pipelinestat_block_size;
723
724 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
725 uint32_t cs_invoc_offset =
726 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT);
727 va += cs_invoc_offset;
728 }
729
730 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
731 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
732 radeon_emit(cs, va);
733 radeon_emit(cs, va >> 32);
734
735 if (pool->uses_emulated_queries) {
736 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
737 const uint64_t prim_va = va + pipelinestat_block_size + 8;
738
739 gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_GS_PRIM_EMIT_OFFSET, prim_va);
740 }
741
742 if (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT) {
743 uint32_t mesh_invoc_offset =
744 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT);
745
746 gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_MS_INVOCATION_OFFSET, va + mesh_invoc_offset);
747 }
748
749 cmd_buffer->state.active_emulated_pipeline_queries--;
750
751 if (!cmd_buffer->state.active_emulated_pipeline_queries)
752 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
753 }
754
755 if (pool->uses_ace) {
756 uint32_t task_invoc_offset =
757 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
758
759 if (pdev->info.gfx_level >= GFX11) {
760 va += task_invoc_offset;
761
762 radeon_check_space(device->ws, cmd_buffer->gang.cs, 4);
763
764 radeon_emit(cmd_buffer->gang.cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
765 radeon_emit(cmd_buffer->gang.cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
766 radeon_emit(cmd_buffer->gang.cs, va);
767 radeon_emit(cmd_buffer->gang.cs, va >> 32);
768 } else {
769 radeon_check_space(device->ws, cmd_buffer->gang.cs, 11);
770
771 gfx10_copy_shader_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
772 radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
773
774 cmd_buffer->state.active_pipeline_ace_queries--;
775
776 if (!cmd_buffer->state.active_pipeline_ace_queries)
777 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
778 }
779 }
780
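/* Signal availability (value 1) once all prior work has reached bottom-of-pipe. */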
781 radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
782 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va);
783 }
784
785 static void
786 radv_copy_pipeline_stat_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
787 uint32_t first_query, uint32_t query_count, struct radeon_winsys_bo *dst_bo,
788 uint64_t dst_offset, uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
789 {
790 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
791 const struct radv_physical_device *pdev = radv_device_physical(device);
792 struct radeon_cmdbuf *cs = cmd_buffer->cs;
793 uint64_t va = radv_buffer_get_va(pool->bo);
794
795 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
796 const uint32_t task_invoc_offset =
797 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
798 const unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
799
800 for (unsigned i = 0; i < query_count; ++i) {
801 unsigned query = first_query + i;
802
803 radeon_check_space(device->ws, cs, 7);
804
805 uint64_t avail_va = va + pool->availability_offset + 4 * query;
806
807 /* This waits on the ME. All copies below are done on the ME */
808 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
809
810 if (pool->uses_ace && pdev->emulate_mesh_shader_queries) {
811 const uint64_t src_va = va + query * pool->stride;
812 const uint64_t start_va = src_va + task_invoc_offset + 4;
813 const uint64_t stop_va = start_va + pipelinestat_block_size;
814
815 radeon_check_space(device->ws, cs, 7 * 2);
816
817 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, start_va, 0x80000000, 0xffffffff);
818 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, stop_va, 0x80000000, 0xffffffff);
819 }
820 }
821 }
822
823 radv_query_shader(cmd_buffer, VK_QUERY_TYPE_PIPELINE_STATISTICS, pool->bo, dst_bo, first_query * pool->stride,
824 dst_offset, pool->stride, stride, dst_size, query_count, flags, pool->vk.pipeline_statistics,
825 pool->availability_offset + 4 * first_query, pool->uses_emulated_queries);
826 }
827
828 /**
829 * Transform feedback query
830 */
831 static nir_shader *
832 build_tfb_query_shader(struct radv_device *device)
833 {
834 /* the shader this builds is roughly
835 *
836 * uint32_t src_stride = 32;
837 *
838 * location(binding = 0) buffer dst_buf;
839 * location(binding = 1) buffer src_buf;
840 *
841 * void main() {
842 * uint64_t result[2] = {};
843 * bool available = false;
844 * uint64_t src_offset = src_stride * global_id.x;
845 * uint64_t dst_offset = dst_stride * global_id.x;
846 * uint64_t *src_data = src_buf[src_offset];
847 * uint32_t avail = (src_data[0] >> 32) &
848 * (src_data[1] >> 32) &
849 * (src_data[2] >> 32) &
850 * (src_data[3] >> 32);
851 * if (avail & 0x80000000) {
852 * result[0] = src_data[3] - src_data[1];
853 * result[1] = src_data[2] - src_data[0];
854 * available = true;
855 * }
856 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
857 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
858 * if (flags & VK_QUERY_RESULT_64_BIT) {
859 * dst_buf[dst_offset] = result;
860 * } else {
861 * dst_buf[dst_offset] = (uint32_t)result;
862 * }
863 * }
864 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
865 * dst_buf[dst_offset + result_size] = available;
866 * }
867 * }
868 */
869 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query");
870 b.shader->info.workgroup_size[0] = 64;
871
872 /* Create and initialize local variables. */
873 nir_variable *result = nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result");
874 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
875
876 nir_store_var(&b, result, nir_replicate(&b, nir_imm_int64(&b, 0), 2), 0x3);
877 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
878
879 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
880
881 /* Load resources. */
882 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
883 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
884
885 /* Compute global ID. */
886 nir_def *global_id = get_global_ids(&b, 1);
887
888 /* Compute src/dst strides. */
889 nir_def *input_stride = nir_imm_int(&b, 32);
890 nir_def *input_base = nir_imul(&b, input_stride, global_id);
891 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
892 nir_def *output_base = nir_imul(&b, output_stride, global_id);
893
894 /* Load data from the query pool. */
895 nir_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32);
896 nir_def *load2 = nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16);
897
898 /* Check if result is available. */
899 nir_def *avails[2];
900 avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3));
901 avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3));
902 nir_def *result_is_available = nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000);
903
904 /* Only compute result if available. */
905 nir_push_if(&b, result_is_available);
906
907 /* Pack values. */
908 nir_def *packed64[4];
909 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
910 packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3)));
911 packed64[2] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
912 packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3)));
913
914 /* Compute result. */
915 nir_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]);
916 nir_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]);
917
918 nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3);
919 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
920
921 nir_pop_if(&b, NULL);
922
923 /* Determine if result is 64 or 32 bit. */
924 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
925 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
926
927 /* Store the result if complete or partial results have been requested. */
928 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
929
930 /* Store result. */
931 nir_push_if(&b, result_is_64bit);
932
933 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
934
935 nir_push_else(&b, NULL);
936
937 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
938
939 nir_pop_if(&b, NULL);
940 nir_pop_if(&b, NULL);
941
942 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
943 nir_b2i32(&b, nir_load_var(&b, available)));
944
945 return b.shader;
946 }
947
948 static unsigned
949 event_type_for_stream(unsigned stream)
950 {
951 switch (stream) {
952 default:
953 case 0:
954 return V_028A90_SAMPLE_STREAMOUTSTATS;
955 case 1:
956 return V_028A90_SAMPLE_STREAMOUTSTATS1;
957 case 2:
958 return V_028A90_SAMPLE_STREAMOUTSTATS2;
959 case 3:
960 return V_028A90_SAMPLE_STREAMOUTSTATS3;
961 }
962 }
963
964 static void
965 emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
966 {
967 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
968 struct radeon_cmdbuf *cs = cmd_buffer->cs;
969
970 radeon_check_space(device->ws, cs, 4);
971
972 assert(index < MAX_SO_STREAMS);
973
974 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
975 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
976 radeon_emit(cs, va);
977 radeon_emit(cs, va >> 32);
978 }
979
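/* On GFX12 the emulated shader query counters live in a small zero-initialized
 * upload buffer instead of GDS; allocate it lazily, once per command buffer.
 */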
980 static void
981 radv_alloc_shader_query_buf(struct radv_cmd_buffer *cmd_buffer)
982 {
983 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
984 const struct radv_physical_device *pdev = radv_device_physical(device);
985 unsigned offset;
986 void *ptr;
987
988 assert(pdev->info.gfx_level >= GFX12);
989
990 if (cmd_buffer->state.shader_query_buf_va)
991 return;
992
993 if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, RADV_SHADER_QUERY_BUF_SIZE, 64, &offset, &ptr))
994 return;
995
996 memset(ptr, 0, RADV_SHADER_QUERY_BUF_SIZE);
997
998 cmd_buffer->state.shader_query_buf_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
999 cmd_buffer->state.shader_query_buf_va += offset;
1000 }
1001
1002 static void
1003 radv_begin_tfb_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1004 {
1005 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1006 const struct radv_physical_device *pdev = radv_device_physical(device);
1007 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1008
1009 if (pdev->use_ngg_streamout) {
1010 const bool uses_gds = pdev->info.gfx_level < GFX12;
1011
1012 /* generated prim counter */
1013 gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
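/* Mark the upper dword so the copy shader sees this counter as available. */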
1014 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
1015
1016 /* written prim counter */
1017 gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 8);
1018 radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
1019
1020 /* Record that the command buffer needs GDS. */
1021 cmd_buffer->gds_needed |= uses_gds;
1022
1023 if (!cmd_buffer->state.active_emulated_prims_xfb_queries)
1024 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1025
1026 cmd_buffer->state.active_emulated_prims_xfb_queries++;
1027 } else {
1028 cmd_buffer->state.active_prims_xfb_queries++;
1029
1030 radv_update_hw_pipelinestat(cmd_buffer);
1031
1032 emit_sample_streamout(cmd_buffer, va, index);
1033 }
1034 }
1035
1036 static void
1037 radv_end_tfb_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1038 {
1039 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1040 const struct radv_physical_device *pdev = radv_device_physical(device);
1041 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1042
1043 if (pdev->use_ngg_streamout) {
1044 const bool uses_gds = pdev->info.gfx_level < GFX12;
1045
1046 /* generated prim counter */
1047 gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
1048 radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
1049
1050 /* written prim counter */
1051 gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_XFB_OFFSET(index), va + 24);
1052 radv_cs_write_data_imm(cs, V_370_ME, va + 28, 0x80000000);
1053
1054 cmd_buffer->state.active_emulated_prims_xfb_queries--;
1055
1056 if (!cmd_buffer->state.active_emulated_prims_xfb_queries)
1057 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1058 } else {
1059 cmd_buffer->state.active_prims_xfb_queries--;
1060
1061 radv_update_hw_pipelinestat(cmd_buffer);
1062
1063 emit_sample_streamout(cmd_buffer, va + 16, index);
1064 }
1065 }
1066
1067 static void
1068 radv_copy_tfb_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
1069 uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset,
1070 uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
1071 {
1072 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1073 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1074 uint64_t va = radv_buffer_get_va(pool->bo);
1075
1076 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1077 for (unsigned i = 0; i < query_count; i++) {
1078 unsigned query = first_query + i;
1079 uint64_t src_va = va + query * pool->stride;
1080
1081 radeon_check_space(device->ws, cs, 7 * 4);
1082
1083 /* Wait on the upper word of all results. */
1084 for (unsigned j = 0; j < 4; j++, src_va += 8) {
1085 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1086 }
1087 }
1088 }
1089
1090 radv_query_shader(cmd_buffer, VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT, pool->bo, dst_bo,
1091 first_query * pool->stride, dst_offset, pool->stride, stride, dst_size, query_count, flags, 0, 0,
1092 false);
1093 }
1094
1095 /**
1096 * Timestamp query
1097 */
1098 static nir_shader *
1099 build_timestamp_query_shader(struct radv_device *device)
1100 {
1101 /* the shader this builds is roughly
1102 *
1103 * uint32_t src_stride = 8;
1104 *
1105 * location(binding = 0) buffer dst_buf;
1106 * location(binding = 1) buffer src_buf;
1107 *
1108 * void main() {
1109 * uint64_t result = 0;
1110 * bool available = false;
1111 * uint64_t src_offset = src_stride * global_id.x;
1112 * uint64_t dst_offset = dst_stride * global_id.x;
1113 * uint64_t timestamp = src_buf[src_offset];
1114 * if (timestamp != TIMESTAMP_NOT_READY) {
1115 * result = timestamp;
1116 * available = true;
1117 * }
1118 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
1119 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
1120 * if (flags & VK_QUERY_RESULT_64_BIT) {
1121 * dst_buf[dst_offset] = result;
1122 * } else {
1123 * dst_buf[dst_offset] = (uint32_t)result;
1124 * }
1125 * }
1126 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1127 * dst_buf[dst_offset + result_size] = available;
1128 * }
1129 * }
1130 */
1131 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query");
1132 b.shader->info.workgroup_size[0] = 64;
1133
1134 /* Create and initialize local variables. */
1135 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
1136 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
1137
1138 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
1139 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
1140
1141 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
1142
1143 /* Load resources. */
1144 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
1145 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
1146
1147 /* Compute global ID. */
1148 nir_def *global_id = get_global_ids(&b, 1);
1149
1150 /* Compute src/dst strides. */
1151 nir_def *input_stride = nir_imm_int(&b, 8);
1152 nir_def *input_base = nir_imul(&b, input_stride, global_id);
1153 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
1154 nir_def *output_base = nir_imul(&b, output_stride, global_id);
1155
1156 /* Load data from the query pool. */
1157 nir_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8);
1158
1159 /* Pack the timestamp. */
1160 nir_def *timestamp;
1161 timestamp = nir_pack_64_2x32(&b, nir_trim_vector(&b, load, 2));
1162
1163 /* Check if result is available. */
1164 nir_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY));
1165
1166 /* Only store result if available. */
1167 nir_push_if(&b, result_is_available);
1168
1169 nir_store_var(&b, result, timestamp, 0x1);
1170 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
1171
1172 nir_pop_if(&b, NULL);
1173
1174 /* Determine if result is 64 or 32 bit. */
1175 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
1176 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
1177
1178 /* Store the result if complete or partial results have been requested. */
1179 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
1180
1181 /* Store result. */
1182 nir_push_if(&b, result_is_64bit);
1183
1184 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
1185
1186 nir_push_else(&b, NULL);
1187
1188 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
1189
1190 nir_pop_if(&b, NULL);
1191
1192 nir_pop_if(&b, NULL);
1193
1194 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
1195 nir_b2i32(&b, nir_load_var(&b, available)));
1196
1197 return b.shader;
1198 }
1199
1200 static void
1201 radv_copy_timestamp_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
1202 uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset,
1203 uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
1204 {
1205 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1206 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1207 uint64_t va = radv_buffer_get_va(pool->bo);
1208
1209 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1210 for (unsigned i = 0; i < query_count; ++i) {
1211 unsigned query = first_query + i;
1212 uint64_t local_src_va = va + query * pool->stride;
1213
1214 radeon_check_space(device->ws, cs, 7);
1215
1216 /* Wait on the high 32 bits of the timestamp in
1217 * case the low part is 0xffffffff.
1218 */
1219 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4, TIMESTAMP_NOT_READY >> 32,
1220 0xffffffff);
1221 }
1222 }
1223
1224 radv_query_shader(cmd_buffer, VK_QUERY_TYPE_TIMESTAMP, pool->bo, dst_bo, first_query * pool->stride, dst_offset,
1225 pool->stride, stride, dst_size, query_count, flags, 0, 0, false);
1226 }
1227
1228 /**
1229 * Primitives generated query
1230 */
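/* Each primitives-generated query slot holds a 16-byte streamout sample at
 * begin and at end (32 bytes); with emulated NGG queries, an extra 8-byte
 * begin/end counter pair is appended (value + availability dword each).
 */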
1231 #define RADV_PGQ_STRIDE 32
1232 #define RADV_PGQ_STRIDE_EMU (RADV_PGQ_STRIDE + 8 * 2)
1233
1234 static nir_shader *
1235 build_pg_query_shader(struct radv_device *device)
1236 {
1237 /* the shader this builds is roughly
1238 *
1239 * uint32_t src_stride = 32;
1240 *
1241 * location(binding = 0) buffer dst_buf;
1242 * location(binding = 1) buffer src_buf;
1243 *
1244 * void main() {
1245 * uint64_t result = {};
1246 * bool available = false;
1247 * uint64_t src_offset = src_stride * global_id.x;
1248 * uint64_t dst_offset = dst_stride * global_id.x;
1249 * uint64_t *src_data = src_buf[src_offset];
1250 * uint32_t avail = (src_data[0] >> 32) &
1251 * (src_data[2] >> 32);
1252 * if (avail & 0x80000000) {
1253 * result = src_data[2] - src_data[0];
1254 * if (use_emulated_queries) {
1255 * uint32_t ngg_emu_result = 0;
1256 * ngg_emu_result += src_data[9] - src_data[8];
1257 * result += (uint64_t)ngg_emu_result;
1258 * }
1259 * available = true;
1260 * }
1261 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
1262 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
1263 * if (flags & VK_QUERY_RESULT_64_BIT) {
1264 * dst_buf[dst_offset] = result;
1265 * } else {
1266 * dst_buf[dst_offset] = (uint32_t)result;
1267 * }
1268 * }
1269 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1270 * dst_buf[dst_offset + result_size] = available;
1271 * }
1272 * }
1273 */
1274 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query");
1275 b.shader->info.workgroup_size[0] = 64;
1276
1277 /* Create and initialize local variables. */
1278 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
1279 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
1280
1281 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
1282 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
1283
1284 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
1285
1286 /* Load resources. */
1287 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
1288 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
1289
1290 /* Compute global ID. */
1291 nir_def *global_id = get_global_ids(&b, 1);
1292
1293 /* Determine if the query pool uses emulated queries for NGG. */
1294 nir_def *uses_emulated_queries = nir_i2b(&b, nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20));
1295
1296 /* Compute src/dst strides. */
1297 nir_def *input_stride =
1298 nir_bcsel(&b, uses_emulated_queries, nir_imm_int(&b, RADV_PGQ_STRIDE_EMU), nir_imm_int(&b, RADV_PGQ_STRIDE));
1299 nir_def *input_base = nir_imul(&b, input_stride, global_id);
1300 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
1301 nir_def *output_base = nir_imul(&b, output_stride, global_id);
1302
1303 /* Load data from the query pool. */
1304 nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
1305 nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16);
1306
1307 /* Check if result is available. */
1308 nir_def *avails[2];
1309 avails[0] = nir_channel(&b, load1, 1);
1310 avails[1] = nir_channel(&b, load2, 1);
1311 nir_store_var(&b, available, nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000)), 0x1);
1312
1313 nir_push_if(&b, uses_emulated_queries);
1314 {
1315 nir_def *emu_avail_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 36), .align_mul = 4);
1316 nir_def *emu_avail_end = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 44), .align_mul = 4);
1317 nir_def *emu_result_available =
1318 nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, emu_avail_start, emu_avail_end), 0x80000000));
1319
1320 nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), emu_result_available), 0x1);
1321 }
1322 nir_pop_if(&b, NULL);
1323
1324 /* Only compute result if available. */
1325 nir_push_if(&b, nir_load_var(&b, available));
1326
1327 /* Pack values. */
1328 nir_def *packed64[2];
1329 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
1330 packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
1331
1332 /* Compute result. */
1333 nir_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]);
1334
1335 nir_store_var(&b, result, primitive_storage_needed, 0x1);
1336
1337 nir_push_if(&b, uses_emulated_queries);
1338 {
1339 nir_def *emu_start =
1340 nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4);
1341 nir_def *emu_end =
1342 nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 4);
1343
1344 nir_def *ngg_emu_result = nir_isub(&b, emu_end, emu_start);
1345
1346 nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), nir_u2u64(&b, ngg_emu_result)), 0x1);
1347 }
1348 nir_pop_if(&b, NULL);
1349
1350 nir_pop_if(&b, NULL);
1351
1352 /* Determine if result is 64 or 32 bit. */
1353 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
1354 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
1355
1356 /* Store the result if complete or partial results have been requested. */
1357 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
1358
1359 /* Store result. */
1360 nir_push_if(&b, result_is_64bit);
1361
1362 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
1363
1364 nir_push_else(&b, NULL);
1365
1366 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
1367
1368 nir_pop_if(&b, NULL);
1369 nir_pop_if(&b, NULL);
1370
1371 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
1372 nir_b2i32(&b, nir_load_var(&b, available)));
1373
1374 return b.shader;
1375 }
1376
1377 static void
1378 radv_begin_pg_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint32_t index)
1379 {
1380 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1381 const struct radv_physical_device *pdev = radv_device_physical(device);
1382 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1383
1384 if (pdev->info.gfx_level >= GFX11) {
1385 const bool uses_gds = pdev->info.gfx_level < GFX12;
1386
1387 /* On GFX11+, primitives generated queries are always emulated. */
1388 gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va);
1389 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
1390
1391 /* Record that the command buffer needs GDS. */
1392 cmd_buffer->gds_needed |= uses_gds;
1393
1394 if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1395 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1396
1397 cmd_buffer->state.active_emulated_prims_gen_queries++;
1398 } else {
1399 if (!cmd_buffer->state.active_prims_gen_queries) {
1400 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1401
1402 cmd_buffer->state.active_prims_gen_queries++;
1403
1404 if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1405 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
1406 }
1407 } else {
1408 cmd_buffer->state.active_prims_gen_queries++;
1409 }
1410
1411 radv_update_hw_pipelinestat(cmd_buffer);
1412
1413 if (pool->uses_emulated_queries) {
1414 /* generated prim counter */
1415 gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 32);
1416 radv_cs_write_data_imm(cs, V_370_ME, va + 36, 0x80000000);
1417
1418 /* Record that the command buffer needs GDS. */
1419 cmd_buffer->gds_needed = true;
1420
1421 if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1422 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1423
1424 cmd_buffer->state.active_emulated_prims_gen_queries++;
1425 }
1426
1427 emit_sample_streamout(cmd_buffer, va, index);
1428 }
1429 }
1430
1431 static void
1432 radv_end_pg_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint32_t index)
1433 {
1434 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1435 const struct radv_physical_device *pdev = radv_device_physical(device);
1436 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1437
1438 if (pdev->info.gfx_level >= GFX11) {
1439 const bool uses_gds = pdev->info.gfx_level < GFX12;
1440
1441 /* On GFX11+, the primitives generated query is always emulated. */
1442 gfx10_copy_shader_query_gfx(cmd_buffer, uses_gds, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 16);
1443 radv_cs_write_data_imm(cs, V_370_ME, va + 20, 0x80000000);
1444
1445 cmd_buffer->state.active_emulated_prims_gen_queries--;
1446
1447 if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1448 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1449 } else {
1450 if (cmd_buffer->state.active_prims_gen_queries == 1) {
1451 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1452
1453 cmd_buffer->state.active_prims_gen_queries--;
1454
1455 if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1456 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
1457 }
1458 } else {
1459 cmd_buffer->state.active_prims_gen_queries--;
1460 }
1461
1462 radv_update_hw_pipelinestat(cmd_buffer);
1463
1464 if (pool->uses_emulated_queries) {
1465 /* generated prim counter */
1466 gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 40);
1467 radv_cs_write_data_imm(cs, V_370_ME, va + 44, 0x80000000);
1468
1469 cmd_buffer->state.active_emulated_prims_gen_queries--;
1470
1471 if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1472 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1473 }
1474
1475 emit_sample_streamout(cmd_buffer, va + 16, index);
1476 }
1477 }
1478
1479 static void
1480 radv_copy_pg_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
1481 uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset, uint64_t dst_size,
1482 uint64_t stride, VkQueryResultFlags flags)
1483 {
1484 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1485 const struct radv_physical_device *pdev = radv_device_physical(device);
1486 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1487 uint64_t va = radv_buffer_get_va(pool->bo);
1488
1489 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1490 const bool uses_emulated_queries = pool->uses_emulated_queries && pdev->info.gfx_level < GFX11;
1491
1492 for (unsigned i = 0; i < query_count; i++) {
1493 unsigned query = first_query + i;
1494 uint64_t src_va = va + query * pool->stride;
1495
1496 radeon_check_space(device->ws, cs, 7 * 4);
1497
1498 /* Wait on the upper word of the PrimitiveStorageNeeded result. */
1499 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1500 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
1501
1502 if (uses_emulated_queries) {
1503 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 36, 0x80000000, 0xffffffff);
1504 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 44, 0x80000000, 0xffffffff);
1505 }
1506 }
1507 }
1508
1509 radv_query_shader(cmd_buffer, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT, pool->bo, dst_bo, first_query * pool->stride,
1510 dst_offset, pool->stride, stride, dst_size, query_count, flags, 0, 0,
1511 pool->uses_emulated_queries && pdev->info.gfx_level < GFX11);
1512 }
1513
1514 /**
1515 * Mesh primitives generated query
1516 */
1517 static nir_shader *
1518 build_ms_prim_gen_query_shader(struct radv_device *device)
1519 {
1520 /* the shader this builds is roughly
1521 *
1522 * uint32_t src_stride = 16;
1523 *
1524 * layout(binding = 0) buffer dst_buf;
1525 * layout(binding = 1) buffer src_buf;
1526 *
1527 * void main() {
1528 * uint64_t result = {};
1529 * bool available = false;
1530 * uint64_t src_offset = src_stride * global_id.x;
1531 * uint64_t dst_offset = dst_stride * global_id.x;
1532 * uint64_t *src_data = src_buf[src_offset];
1533 * uint32_t avail = (src_data[0] >> 32) & (src_data[1] >> 32);
1534 * if (avail & 0x80000000) {
1535 * result = src_data[1] - src_data[0];
1536 * available = true;
1537 * }
1538 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
1539 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
1540 * if (flags & VK_QUERY_RESULT_64_BIT) {
1541 * dst_buf[dst_offset] = result;
1542 * } else {
1543 * dst_buf[dst_offset] = (uint32_t)result;
1544 * }
1545 * }
1546 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1547 * dst_buf[dst_offset + result_size] = available;
1548 * }
1549 * }
1550 */
1551 nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "ms_prim_gen_query");
1552 b.shader->info.workgroup_size[0] = 64;
1553
1554 /* Create and initialize local variables. */
1555 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
1556 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
1557
1558 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
1559 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
1560
1561 nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
1562
1563 /* Load resources. */
1564 nir_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
1565 nir_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
1566
1567 /* Compute global ID. */
1568 nir_def *global_id = get_global_ids(&b, 1);
1569
1570 /* Compute src/dst strides. */
1571 nir_def *input_base = nir_imul_imm(&b, global_id, 16);
1572 nir_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
1573 nir_def *output_base = nir_imul(&b, output_stride, global_id);
1574
1575 /* Load data from the query pool. */
1576 nir_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
1577 nir_def *load2 = nir_load_ssbo(&b, 2, 32, src_buf, nir_iadd_imm(&b, input_base, 8), .align_mul = 16);
1578
1579 /* Check if result is available. */
1580 nir_def *avails[2];
1581 avails[0] = nir_channel(&b, load1, 1);
1582 avails[1] = nir_channel(&b, load2, 1);
1583 nir_def *result_is_available = nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000));
1584
1585 /* Only compute result if available. */
1586 nir_push_if(&b, result_is_available);
1587
1588 /* Pack values. */
1589 nir_def *packed64[2];
1590 packed64[0] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load1, 2));
1591 packed64[1] = nir_pack_64_2x32(&b, nir_trim_vector(&b, load2, 2));
1592
1593 /* Compute result. */
1594 nir_def *ms_prim_gen = nir_isub(&b, packed64[1], packed64[0]);
1595
1596 nir_store_var(&b, result, ms_prim_gen, 0x1);
1597
1598 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
1599
1600 nir_pop_if(&b, NULL);
1601
1602 /* Determine if result is 64 or 32 bit. */
1603 nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
1604 nir_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
1605
1606 /* Store the result if complete or partial results have been requested. */
1607 nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
1608
1609 /* Store result. */
1610 nir_push_if(&b, result_is_64bit);
1611
1612 nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
1613
1614 nir_push_else(&b, NULL);
1615
1616 nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
1617
1618 nir_pop_if(&b, NULL);
1619 nir_pop_if(&b, NULL);
1620
1621 radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
1622 nir_b2i32(&b, nir_load_var(&b, available)));
1623
1624 return b.shader;
1625 }
1626
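/* Mesh primitives generated query.
 *
 * On GFX11+ the counter is sampled through the pipeline statistics mechanism
 * (SAMPLE_PIPELINESTAT plus a separate availability dword); on older generations it is
 * emulated with a shader query counter copied to offsets 0 (begin) and 8 (end), each
 * followed by its availability dword.
 */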
1627 static void
1628 radv_begin_ms_prim_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
1629 {
1630 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1631 const struct radv_physical_device *pdev = radv_device_physical(device);
1632 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1633
1634 if (pdev->info.gfx_level >= GFX11) {
1635 radeon_check_space(device->ws, cs, 4);
1636
1637 ++cmd_buffer->state.active_pipeline_queries;
1638
1639 radv_update_hw_pipelinestat(cmd_buffer);
1640
1641 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1642 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1643 radeon_emit(cs, va);
1644 radeon_emit(cs, va >> 32);
1645 } else {
1646 gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va);
1647 radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000);
1648
1649 /* Record that the command buffer needs GDS. */
1650 cmd_buffer->gds_needed = true;
1651
1652 if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1653 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1654
1655 cmd_buffer->state.active_emulated_prims_gen_queries++;
1656 }
1657 }
1658
1659 static void
1660 radv_end_ms_prim_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint64_t avail_va)
1661 {
1662 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1663 const struct radv_physical_device *pdev = radv_device_physical(device);
1664 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1665
1666 if (pdev->info.gfx_level >= GFX11) {
1667 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
1668
1669 radeon_check_space(device->ws, cs, 16);
1670
1671 cmd_buffer->state.active_pipeline_queries--;
1672
1673 radv_update_hw_pipelinestat(cmd_buffer);
1674
1675 va += pipelinestat_block_size;
1676
1677 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1678 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1679 radeon_emit(cs, va);
1680 radeon_emit(cs, va >> 32);
1681
1682 radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
1683 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1, cmd_buffer->gfx9_eop_bug_va);
1684 } else {
1685 gfx10_copy_shader_query_gfx(cmd_buffer, true, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va + 8);
1686 radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000);
1687
1688 cmd_buffer->state.active_emulated_prims_gen_queries--;
1689
1690 if (!cmd_buffer->state.active_emulated_prims_gen_queries)
1691 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
1692 }
1693 }
1694
1695 static void
1696 radv_copy_ms_prim_query_result(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint32_t first_query,
1697 uint32_t query_count, struct radeon_winsys_bo *dst_bo, uint64_t dst_offset,
1698 uint64_t dst_size, uint64_t stride, VkQueryResultFlags flags)
1699 {
1700 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1701 const struct radv_physical_device *pdev = radv_device_physical(device);
1702 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1703 uint64_t va = radv_buffer_get_va(pool->bo);
1704
1705 if (pdev->info.gfx_level >= GFX11) {
1706 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1707 for (unsigned i = 0; i < query_count; ++i) {
1708 unsigned query = first_query + i;
1709
1710 radeon_check_space(device->ws, cs, 7);
1711
1712 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1713
1714 /* This waits on the ME. All copies below are done on the ME. */
1715 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1716 }
1717 }
1718
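      /* Bit 13 selects the counter slot that holds the number of mesh primitives generated; the
       * CPU readback path reads the same slot via pipeline_statistics_indices[13].
       */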
1719 radv_query_shader(cmd_buffer, VK_QUERY_TYPE_PIPELINE_STATISTICS, pool->bo, dst_bo, first_query * pool->stride,
1720 dst_offset, pool->stride, stride, dst_size, query_count, flags, 1 << 13,
1721 pool->availability_offset + 4 * first_query, false);
1722 } else {
1723 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1724 for (unsigned i = 0; i < query_count; i++) {
1725 unsigned query = first_query + i;
1726 uint64_t src_va = va + query * pool->stride;
1727
1728 radeon_check_space(device->ws, cs, 7 * 2);
1729
1730 /* Wait on the upper word. */
1731 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1732 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 12, 0x80000000, 0xffffffff);
1733 }
1734 }
1735
1736 radv_query_shader(cmd_buffer, VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT, pool->bo, dst_bo,
1737 first_query * pool->stride, dst_offset, pool->stride, stride, dst_size, query_count, flags, 0,
1738 0, false);
1739 }
1740 }
1741
1742 static VkResult
1743 create_layout(struct radv_device *device, VkPipelineLayout *layout_out)
1744 {
1745 enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_QUERY;
1746
1747 const VkDescriptorSetLayoutBinding bindings[] = {
1748 {.binding = 0,
1749 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1750 .descriptorCount = 1,
1751 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT},
1752 {
1753 .binding = 1,
1754 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1755 .descriptorCount = 1,
1756 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1757 },
1758 };
1759
1760 const VkDescriptorSetLayoutCreateInfo desc_info = {
1761 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
1762 .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT,
1763 .bindingCount = 2,
1764 .pBindings = bindings,
1765 };
1766
1767 const VkPushConstantRange pc_range = {
1768 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1769 .size = 20,
1770 };
1771
1772 return vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key, sizeof(key),
1773 layout_out);
1774 }
1775
1776 static VkResult
1777 get_pipeline(struct radv_device *device, VkQueryType query_type, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
1778 {
1779 enum radv_meta_object_key_type key = 0;
1780 VkResult result;
1781 nir_shader *cs;
1782
1783 result = create_layout(device, layout_out);
1784 if (result != VK_SUCCESS)
1785 return result;
1786
1787 switch (query_type) {
1788 case VK_QUERY_TYPE_OCCLUSION:
1789 key = RADV_META_OBJECT_KEY_QUERY_OCCLUSION;
1790 break;
1791 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1792 key = RADV_META_OBJECT_KEY_QUERY_PIPELINE_STATS;
1793 break;
1794 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1795 key = RADV_META_OBJECT_KEY_QUERY_TFB;
1796 break;
1797 case VK_QUERY_TYPE_TIMESTAMP:
1798 key = RADV_META_OBJECT_KEY_QUERY_TIMESTAMP;
1799 break;
1800 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1801 key = RADV_META_OBJECT_KEY_QUERY_PRIMS_GEN;
1802 break;
1803 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1804 key = RADV_META_OBJECT_KEY_QUERY_MESH_PRIMS_GEN;
1805 break;
1806 default:
1807 unreachable("invalid query type");
1808 }
1809
1810 VkPipeline pipeline_from_cache = vk_meta_lookup_pipeline(&device->meta_state.device, &key, sizeof(key));
1811 if (pipeline_from_cache != VK_NULL_HANDLE) {
1812 *pipeline_out = pipeline_from_cache;
1813 return VK_SUCCESS;
1814 }
1815
1816 switch (query_type) {
1817 case VK_QUERY_TYPE_OCCLUSION:
1818 cs = build_occlusion_query_shader(device);
1819 break;
1820 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1821 cs = build_pipeline_statistics_query_shader(device);
1822 break;
1823 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1824 cs = build_tfb_query_shader(device);
1825 break;
1826 case VK_QUERY_TYPE_TIMESTAMP:
1827 cs = build_timestamp_query_shader(device);
1828 break;
1829 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1830 cs = build_pg_query_shader(device);
1831 break;
1832 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
1833 cs = build_ms_prim_gen_query_shader(device);
1834 break;
1835 default:
1836 unreachable("invalid query type");
1837 }
1838
1839 const VkPipelineShaderStageCreateInfo stage_info = {
1840 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1841 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1842 .module = vk_shader_module_handle_from_nir(cs),
1843 .pName = "main",
1844 .pSpecializationInfo = NULL,
1845 };
1846
1847 const VkComputePipelineCreateInfo pipeline_info = {
1848 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1849 .stage = stage_info,
1850 .flags = 0,
1851 .layout = *layout_out,
1852 };
1853
1854 result = vk_meta_create_compute_pipeline(&device->vk, &device->meta_state.device, &pipeline_info, &key, sizeof(key),
1855 pipeline_out);
1856
1857 ralloc_free(cs);
1858 return result;
1859 }
1860
1861 static void
1862 radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkQueryType query_type, struct radeon_winsys_bo *src_bo,
1863 struct radeon_winsys_bo *dst_bo, uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride,
1864 uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags, uint32_t pipeline_stats_mask,
1865 uint32_t avail_offset, bool uses_emulated_queries)
1866 {
1867 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1868 struct radv_meta_saved_state saved_state;
1869 struct radv_buffer src_buffer, dst_buffer;
1870 VkPipelineLayout layout;
1871 VkPipeline pipeline;
1872 VkResult result;
1873
1874 result = get_pipeline(device, query_type, &pipeline, &layout);
1875 if (result != VK_SUCCESS) {
1876 vk_command_buffer_set_error(&cmd_buffer->vk, result);
1877 return;
1878 }
1879
1880 /* VK_EXT_conditional_rendering says that copy commands should not be
1881 * affected by conditional rendering.
1882 */
1883 radv_meta_save(&saved_state, cmd_buffer,
1884 RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS |
1885 RADV_META_SUSPEND_PREDICATING);
1886
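   /* The source range must cover every sampled query slot and, for pipeline statistics, the
    * availability dwords as well; the destination covers (count - 1) full strides plus the size
    * of the final element.
    */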
1887 uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset);
1888 uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size;
1889
1890 radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset);
1891 radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset);
1892
1893 radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
1894
1895 radv_meta_push_descriptor_set(
1896 cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, 2,
1897 (VkWriteDescriptorSet[]){{.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1898 .dstBinding = 0,
1899 .dstArrayElement = 0,
1900 .descriptorCount = 1,
1901 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1902 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer),
1903 .offset = 0,
1904 .range = VK_WHOLE_SIZE}},
1905 {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1906 .dstBinding = 1,
1907 .dstArrayElement = 0,
1908 .descriptorCount = 1,
1909 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1910 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer),
1911 .offset = 0,
1912 .range = VK_WHOLE_SIZE}}});
1913
1914 /* Encode the number of elements for easy access by the shader. */
1915 pipeline_stats_mask &= (1 << (radv_get_pipelinestat_query_size(device) / 8)) - 1;
1916 pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
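   /* Resulting encoding: the lower 16 bits keep the statistics mask itself (clamped to the
    * counters present in the HW block), while bits 16+ hold the number of enabled counters so
    * the shader does not have to recount them.
    */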
1917
1918 avail_offset -= src_offset;
1919
1920 struct {
1921 uint32_t flags;
1922 uint32_t dst_stride;
1923 uint32_t pipeline_stats_mask;
1924 uint32_t avail_offset;
1925 uint32_t uses_emulated_queries;
1926 } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_emulated_queries};
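   /* This layout must match the 20-byte push constant range declared in create_layout() and the
    * offsets the query shaders load from: flags at offset 0, the destination stride at offset 4,
    * and so on.
    */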
1927
1928 vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), layout, VK_SHADER_STAGE_COMPUTE_BIT, 0,
1929 sizeof(push_constants), &push_constants);
1930
1931 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1932
1933 if (flags & VK_QUERY_RESULT_WAIT_BIT)
1934 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1935
1936 radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1937
1938 /* Ensure that the query copy dispatch is complete before a potential vkCmdResetPool because
1939 * there is an implicit execution dependency from each such query command to all query commands
1940 * previously submitted to the same queue.
1941 */
1942 cmd_buffer->active_query_flush_bits |=
1943 RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1944
1945 radv_buffer_finish(&src_buffer);
1946 radv_buffer_finish(&dst_buffer);
1947
1948 radv_meta_restore(&saved_state, cmd_buffer);
1949 }
1950
1951 static void
1952 radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
1953 struct radv_query_pool *pool)
1954 {
1955 if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
1956 radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
1957
1958 if (pool->bo)
1959 radv_bo_destroy(device, &pool->vk.base, pool->bo);
1960
1961 radv_rmv_log_resource_destroy(device, (uint64_t)radv_query_pool_to_handle(pool));
1962 vk_query_pool_finish(&pool->vk);
1963 vk_free2(&device->vk.alloc, pAllocator, pool);
1964 }
1965
1966 static VkResult
1967 radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *pCreateInfo,
1968 const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1969 {
1970 struct radv_physical_device *pdev = radv_device_physical(device);
1971 VkResult result;
1972 size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
1973 ? sizeof(struct radv_pc_query_pool)
1974 : sizeof(struct radv_query_pool);
1975
1976 struct radv_query_pool *pool =
1977 vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1978
1979 if (!pool)
1980 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1981
1982 vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
1983
1984 /* The number of primitives generated by geometry shader invocations is only counted by the
1985 * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number
1986 * of generated primitives and we have to increment it from the shader using a plain GDS atomic.
1987 */
1988 pool->uses_emulated_queries =
1989 (pdev->emulate_ngg_gs_query_pipeline_stat &&
1990 (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)) ||
1991 (pdev->use_ngg && pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT) ||
1992 (pdev->emulate_mesh_shader_queries &&
1993 (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT ||
1994 pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT));
1995
1996 /* The number of task shader invocations needs to be queried on ACE. */
1997 pool->uses_ace = (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
1998
1999 pool->uses_shader_query_buf =
2000 pdev->info.gfx_level >= GFX12 && (pool->vk.query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT ||
2001 pool->vk.query_type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);
2002
2003 switch (pCreateInfo->queryType) {
2004 case VK_QUERY_TYPE_OCCLUSION:
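      /* 16 bytes per render backend: a begin/end pair of 64-bit ZPASS counters, with bit 63 of
       * each value used as the availability flag (see the occlusion readback path).
       */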
2005 pool->stride = 16 * pdev->info.max_render_backends;
2006 break;
2007 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2008 pool->stride = radv_get_pipelinestat_query_size(device) * 2;
2009 if (pool->uses_emulated_queries) {
2010 /* When the query pool uses emulated queries (for counting the number of primitives
2011 * generated by a geometry shader with NGG), allocate 2x64-bit values for begin/end.
2012 */
2013 pool->stride += 8 * 2;
2014 }
2015 break;
2016 case VK_QUERY_TYPE_TIMESTAMP:
2017 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2018 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2019 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2020 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2021 pool->stride = 8;
2022 break;
2023 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2024 pool->stride = 32;
2025 break;
2026 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
2027 if (pool->uses_emulated_queries && pdev->info.gfx_level < GFX11) {
2028 /* When the hardware can use both the legacy and the NGG paths in the same begin/end pair,
2029 * allocate 2x64-bit values for the emulated counters.
2030 */
2031 pool->stride = RADV_PGQ_STRIDE_EMU;
2032 } else {
2033 pool->stride = RADV_PGQ_STRIDE;
2034 }
2035 break;
2036 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2037 result = radv_pc_init_query_pool(pdev, pCreateInfo, (struct radv_pc_query_pool *)pool);
2038
2039 if (result != VK_SUCCESS) {
2040 radv_destroy_query_pool(device, pAllocator, pool);
2041 return vk_error(device, result);
2042 }
2043 break;
2044 }
2045 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
2046 if (pdev->info.gfx_level >= GFX11) {
2047 /* GFX11 natively supports mesh generated primitives with pipeline statistics. */
2048 pool->stride = radv_get_pipelinestat_query_size(device) * 2;
2049 } else {
2050 assert(pdev->emulate_mesh_shader_queries);
2051 pool->stride = 16;
2052 }
2053 break;
2054 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
2055 pool->stride = 48;
2056 break;
2057 default:
2058 unreachable("creating unhandled query type");
2059 }
2060
2061 pool->availability_offset = pool->stride * pCreateInfo->queryCount;
2062 pool->size = pool->availability_offset;
2063 if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
2064 (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11))
2065 pool->size += 4 * pCreateInfo->queryCount;
2066
2067 result = radv_bo_create(device, &pool->vk.base, pool->size, 64, RADEON_DOMAIN_GTT,
2068 RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_QUERY_POOL, 0, false, &pool->bo);
2069 if (result != VK_SUCCESS) {
2070 radv_destroy_query_pool(device, pAllocator, pool);
2071 return vk_error(device, result);
2072 }
2073
2074 pool->ptr = radv_buffer_map(device->ws, pool->bo);
2075 if (!pool->ptr) {
2076 radv_destroy_query_pool(device, pAllocator, pool);
2077 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
2078 }
2079
2080 *pQueryPool = radv_query_pool_to_handle(pool);
2081 radv_rmv_log_query_pool_create(device, *pQueryPool);
2082 return VK_SUCCESS;
2083 }
2084
2085 VKAPI_ATTR VkResult VKAPI_CALL
2086 radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
2087 const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
2088 {
2089 VK_FROM_HANDLE(radv_device, device, _device);
2090 return radv_create_query_pool(device, pCreateInfo, pAllocator, pQueryPool);
2091 }
2092
2093 VKAPI_ATTR void VKAPI_CALL
2094 radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator)
2095 {
2096 VK_FROM_HANDLE(radv_device, device, _device);
2097 VK_FROM_HANDLE(radv_query_pool, pool, _pool);
2098
2099 if (!pool)
2100 return;
2101
2102 radv_destroy_query_pool(device, pAllocator, pool);
2103 }
2104
2105 static inline uint64_t
2106 radv_get_rel_timeout_for_query(VkQueryType type)
2107 {
2108 /*
2109 * Certain queries are only possible on certain types of queues,
2110 * so pick the TDR timeout of the highest possible type
2111 * and double it to ensure GetQueryPoolResults completes in finite time.
2112 *
2113 * (Compute has a longer TDR than gfx and the other rings.)
2114 */
2115 switch (type) {
2116 case VK_QUERY_TYPE_OCCLUSION:
2117 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
2118 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2119 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
2120 return radv_get_tdr_timeout_for_ip(AMD_IP_GFX) * 2;
2121 default:
2122 return radv_get_tdr_timeout_for_ip(AMD_IP_COMPUTE) * 2;
2123 }
2124 }
2125
2126 VKAPI_ATTR VkResult VKAPI_CALL
2127 radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount,
2128 size_t dataSize, void *pData, VkDeviceSize stride, VkQueryResultFlags flags)
2129 {
2130 VK_FROM_HANDLE(radv_device, device, _device);
2131 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2132 const struct radv_physical_device *pdev = radv_device_physical(device);
2133 char *data = pData;
2134 VkResult result = VK_SUCCESS;
2135
2136 if (vk_device_is_lost(&device->vk))
2137 return VK_ERROR_DEVICE_LOST;
2138
2139 for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) {
2140 char *dest = data;
2141 unsigned query = firstQuery + query_idx;
2142 char *src = pool->ptr + query * pool->stride;
2143 uint32_t available;
2144 bool timed_out = false;
2145 uint64_t atimeout = os_time_get_absolute_timeout(radv_get_rel_timeout_for_query(pool->vk.query_type));
2146
2147 switch (pool->vk.query_type) {
2148 case VK_QUERY_TYPE_TIMESTAMP:
2149 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2150 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2151 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2152 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: {
2153 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2154 uint64_t value;
2155
2156 do {
2157 value = p_atomic_read(&src64->value);
2158 } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
2159 !(timed_out = (atimeout < os_time_get_nano())));
2160
2161 available = value != TIMESTAMP_NOT_READY;
2162
2163 if (timed_out)
2164 result = VK_ERROR_DEVICE_LOST;
2165 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2166 result = VK_NOT_READY;
2167
2168 if (flags & VK_QUERY_RESULT_64_BIT) {
2169 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2170 *(uint64_t *)dest = value;
2171 dest += 8;
2172 } else {
2173 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2174 *(uint32_t *)dest = (uint32_t)value;
2175 dest += 4;
2176 }
2177 break;
2178 }
2179 case VK_QUERY_TYPE_OCCLUSION: {
2180 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2181 uint32_t db_count = pdev->info.max_render_backends;
2182 uint64_t enabled_rb_mask = pdev->info.enabled_rb_mask;
2183 uint64_t sample_count = 0;
2184 available = 1;
2185
2186 for (int i = 0; i < db_count; ++i) {
2187 uint64_t start, end;
2188
2189 if (!(enabled_rb_mask & (1ull << i)))
2190 continue;
2191
2192 do {
2193 start = p_atomic_read(&src64[2 * i].value);
2194 end = p_atomic_read(&src64[2 * i + 1].value);
2195 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
2196 !(timed_out = (atimeout < os_time_get_nano())));
2197
2198 if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
2199 available = 0;
2200 else {
2201 sample_count += end - start;
2202 }
2203 }
2204
2205 if (timed_out)
2206 result = VK_ERROR_DEVICE_LOST;
2207 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2208 result = VK_NOT_READY;
2209
2210 if (flags & VK_QUERY_RESULT_64_BIT) {
2211 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2212 *(uint64_t *)dest = sample_count;
2213 dest += 8;
2214 } else {
2215 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2216 *(uint32_t *)dest = sample_count;
2217 dest += 4;
2218 }
2219 break;
2220 }
2221 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
2222 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
2223 const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
2224 uint64_t ngg_emu_result = 0;
2225
2226 do {
2227 available = p_atomic_read(avail_ptr);
2228
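         /* Task shader invocations are sampled separately on ACE, so also check the availability
          * bits of their begin/end values before declaring the query available.
          */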
2229 if (pool->uses_ace && pdev->emulate_mesh_shader_queries) {
2230 const uint32_t task_invoc_offset =
2231 radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
2232 const uint32_t *avail_ptr_start = (const uint32_t *)(src + task_invoc_offset + 4);
2233 const uint32_t *avail_ptr_stop =
2234 (const uint32_t *)(src + pipelinestat_block_size + task_invoc_offset + 4);
2235
2236 if (!(p_atomic_read(avail_ptr_start) & 0x80000000) || !(p_atomic_read(avail_ptr_stop) & 0x80000000))
2237 available = 0;
2238 }
2239 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
2240
2241 if (timed_out)
2242 result = VK_ERROR_DEVICE_LOST;
2243 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2244 result = VK_NOT_READY;
2245
2246 if (pool->uses_emulated_queries) {
2247 /* Compute the result that was copied from the emulated counters. */
2248 const uint64_t *emu_start = (uint64_t *)(src + pipelinestat_block_size * 2);
2249 const uint64_t *emu_stop = (uint64_t *)(src + pipelinestat_block_size * 2 + 8);
2250
2251 ngg_emu_result = emu_stop[0] - emu_start[0];
2252 }
2253
2254 const uint64_t *start = (uint64_t *)src;
2255 const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
2256 if (flags & VK_QUERY_RESULT_64_BIT) {
2257 uint64_t *dst = (uint64_t *)dest;
2258 dest += util_bitcount(pool->vk.pipeline_statistics) * 8;
2259 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
2260 if (pool->vk.pipeline_statistics & (1u << i)) {
2261 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
2262 *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
2263
2264 if (pool->uses_emulated_queries &&
2265 (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)
2266 *dst += ngg_emu_result;
2267 }
2268 dst++;
2269 }
2270 }
2271
2272 } else {
2273 uint32_t *dst = (uint32_t *)dest;
2274 dest += util_bitcount(pool->vk.pipeline_statistics) * 4;
2275 for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
2276 if (pool->vk.pipeline_statistics & (1u << i)) {
2277 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
2278 *dst = stop[pipeline_statistics_indices[i]] - start[pipeline_statistics_indices[i]];
2279
2280 if (pool->uses_emulated_queries &&
2281 (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT)
2282 *dst += ngg_emu_result;
2283 }
2284 dst++;
2285 }
2286 }
2287 }
2288 break;
2289 }
2290 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
2291 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2292 uint64_t num_primitives_written;
2293 uint64_t primitive_storage_needed;
2294
2295 /* SAMPLE_STREAMOUTSTATS stores this structure:
2296 * {
2297 * u64 NumPrimitivesWritten;
2298 * u64 PrimitiveStorageNeeded;
2299 * }
2300 */
2301 do {
2302 available = 1;
2303 for (int j = 0; j < 4; j++) {
2304 if (!(p_atomic_read(&src64[j].value) & 0x8000000000000000UL))
2305 available = 0;
2306 }
2307 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
2308
2309 if (timed_out)
2310 result = VK_ERROR_DEVICE_LOST;
2311 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2312 result = VK_NOT_READY;
2313
2314 num_primitives_written = p_atomic_read_relaxed(&src64[3].value) - p_atomic_read_relaxed(&src64[1].value);
2315 primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
2316
2317 if (flags & VK_QUERY_RESULT_64_BIT) {
2318 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2319 *(uint64_t *)dest = num_primitives_written;
2320 dest += 8;
2321 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2322 *(uint64_t *)dest = primitive_storage_needed;
2323 dest += 8;
2324 } else {
2325 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2326 *(uint32_t *)dest = num_primitives_written;
2327 dest += 4;
2328 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2329 *(uint32_t *)dest = primitive_storage_needed;
2330 dest += 4;
2331 }
2332 break;
2333 }
2334 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
2335 const bool uses_emulated_queries = pool->uses_emulated_queries && pdev->info.gfx_level < GFX11;
2336 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2337 uint64_t primitive_storage_needed;
2338
2339 /* SAMPLE_STREAMOUTSTATS stores this structure:
2340 * {
2341 * u64 NumPrimitivesWritten;
2342 * u64 PrimitiveStorageNeeded;
2343 * }
2344 */
2345 do {
2346 available = 1;
2347 if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
2348 !(p_atomic_read(&src64[2].value) & 0x8000000000000000UL)) {
2349 available = 0;
2350 }
2351 if (uses_emulated_queries && (!(p_atomic_read(&src64[4].value) & 0x8000000000000000UL) ||
2352 !(p_atomic_read(&src64[5].value) & 0x8000000000000000UL))) {
2353 available = 0;
2354 }
2355 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
2356
2357 if (timed_out)
2358 result = VK_ERROR_DEVICE_LOST;
2359 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2360 result = VK_NOT_READY;
2361
2362 primitive_storage_needed = p_atomic_read_relaxed(&src64[2].value) - p_atomic_read_relaxed(&src64[0].value);
2363
2364 if (uses_emulated_queries) {
2365 /* Accumulate the result that was copied from the emulated queries in case NGG shader has been used. */
2366 primitive_storage_needed += p_atomic_read_relaxed(&src64[5].value) - p_atomic_read_relaxed(&src64[4].value);
2367 }
2368
2369 if (flags & VK_QUERY_RESULT_64_BIT) {
2370 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2371 *(uint64_t *)dest = primitive_storage_needed;
2372 dest += 8;
2373 } else {
2374 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2375 *(uint32_t *)dest = primitive_storage_needed;
2376 dest += 4;
2377 }
2378 break;
2379 }
2380 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
2381 struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
2382 const p_atomic_uint64_t *src64 = (const p_atomic_uint64_t *)src;
2383 bool avail;
2384 do {
2385 avail = true;
2386 for (unsigned i = 0; i < pc_pool->num_passes; ++i)
2387 if (!p_atomic_read(&src64[pool->stride / 8 - i - 1].value))
2388 avail = false;
2389 } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano())));
2390
2391 available = avail;
2392
2393 radv_pc_get_results(pc_pool, &src64->value, dest);
2394 dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
2395 break;
2396 }
2397 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: {
2398 uint64_t ms_prim_gen;
2399
2400 if (pdev->info.gfx_level >= GFX11) {
2401 unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device);
2402 const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
2403
2404 do {
2405 available = p_atomic_read(avail_ptr);
2406 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
2407 !(timed_out = (atimeout < os_time_get_nano())));
2408
2409 const uint64_t *start = (uint64_t *)src;
2410 const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
2411
2412 ms_prim_gen = stop[pipeline_statistics_indices[13]] - start[pipeline_statistics_indices[13]];
2413 } else {
2414 p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
2415
2416 do {
2417 available = 1;
2418 if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) ||
2419 !(p_atomic_read(&src64[1].value) & 0x8000000000000000UL)) {
2420 available = 0;
2421 }
2422 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) &&
2423 !(timed_out = (atimeout < os_time_get_nano())));
2424
2425 ms_prim_gen = p_atomic_read_relaxed(&src64[1].value) - p_atomic_read_relaxed(&src64[0].value);
2426 }
2427
2428 if (timed_out)
2429 result = VK_ERROR_DEVICE_LOST;
2430 else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2431 result = VK_NOT_READY;
2432
2433 if (flags & VK_QUERY_RESULT_64_BIT) {
2434 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2435 *(uint64_t *)dest = ms_prim_gen;
2436 dest += 8;
2437 } else {
2438 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
2439 *(uint32_t *)dest = ms_prim_gen;
2440 dest += 4;
2441 }
2442 break;
2443 }
2444 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR: {
2445 uint32_t *src32 = (uint32_t *)src;
2446 uint32_t value;
2447 do {
2448 value = p_atomic_read(&src32[1]);
2449 } while (value != 1 && (flags & VK_QUERY_RESULT_WAIT_BIT));
2450
2451 available = value != 0;
2452
2453 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
2454 result = VK_NOT_READY;
2455
2456 if (flags & VK_QUERY_RESULT_64_BIT) {
2457 uint64_t *dest64 = (uint64_t *)dest;
2458 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
2459 dest64[0] = src32[5];
2460 dest64[1] = src32[6];
2461 }
2462 dest += 16;
2463 if (flags & VK_QUERY_RESULT_WITH_STATUS_BIT_KHR) {
2464 dest64[2] = 1;
2465 dest += 8;
2466 }
2467 } else {
2468 uint32_t *dest32 = (uint32_t *)dest;
2469 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
2470 dest32[0] = src32[5];
2471 dest32[1] = src32[6];
2472 }
2473 dest += 8;
2474 if (flags & VK_QUERY_RESULT_WITH_STATUS_BIT_KHR) {
2475 dest32[2] = 1;
2476 dest += 4;
2477 }
2478 }
2479 break;
2480 }
2481 default:
2482 unreachable("trying to get results of unhandled query type");
2483 }
2484
2485 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
2486 if (flags & VK_QUERY_RESULT_64_BIT) {
2487 *(uint64_t *)dest = available;
2488 } else {
2489 *(uint32_t *)dest = available;
2490 }
2491 }
2492 }
2493
2494 if (result == VK_ERROR_DEVICE_LOST)
2495 vk_device_set_lost(&device->vk, "GetQueryPoolResults timed out");
2496
2497 return result;
2498 }
2499
2500 static void
2501 emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool)
2502 {
2503 if (cmd_buffer->pending_reset_query) {
2504 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
2505 /* Only need to flush caches if the query pool size is
2506 * large enough to be reset using the compute shader
2507 * path. Small pools don't need any cache flushes
2508 * because we use a CP dma clear.
2509 */
2510 radv_emit_cache_flush(cmd_buffer);
2511 }
2512 }
2513 }
2514
2515 VKAPI_ATTR void VKAPI_CALL
2516 radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
2517 uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride,
2518 VkQueryResultFlags flags)
2519 {
2520 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2521 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2522 VK_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
2523 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2524 const struct radv_physical_device *pdev = radv_device_physical(device);
2525 const struct radv_instance *instance = radv_physical_device_instance(pdev);
2526 const uint64_t dst_offset = dst_buffer->offset + dstOffset;
2527 const size_t dst_size = radv_query_result_size(pool, flags);
2528
2529 if (!queryCount)
2530 return;
2531
2532 radv_cs_add_buffer(device->ws, cmd_buffer->cs, pool->bo);
2533 radv_cs_add_buffer(device->ws, cmd_buffer->cs, dst_buffer->bo);
2534
2535 /* Work around engines that forget to properly specify WAIT_BIT because some drivers implicitly
2536 * synchronize before the query copy.
2537 */
2538 if (instance->drirc.flush_before_query_copy)
2539 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
2540
2541 /* From the Vulkan spec 1.1.108:
2542 *
2543 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
2544 * previous uses of vkCmdResetQueryPool in the same queue, without any
2545 * additional synchronization."
2546 *
2547 * So, we have to flush the caches if the compute shader path was used.
2548 */
2549 emit_query_flush(cmd_buffer, pool);
2550
2551 switch (pool->vk.query_type) {
2552 case VK_QUERY_TYPE_OCCLUSION:
2553 radv_copy_occlusion_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size,
2554 stride, flags);
2555 break;
2556 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2557 radv_copy_pipeline_stat_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset,
2558 dst_size, stride, flags);
2559 break;
2560 case VK_QUERY_TYPE_TIMESTAMP:
2561 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2562 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2563 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2564 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2565 radv_copy_timestamp_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size,
2566 stride, flags);
2567 break;
2568 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2569 radv_copy_tfb_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size, stride,
2570 flags);
2571 break;
2572 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
2573 radv_copy_pg_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size, stride,
2574 flags);
2575 break;
2576 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
2577 radv_copy_ms_prim_query_result(cmd_buffer, pool, firstQuery, queryCount, dst_buffer->bo, dst_offset, dst_size,
2578 stride, flags);
2579 break;
2580 default:
2581 unreachable("trying to get results of unhandled query type");
2582 }
2583 }
2584
2585 static uint32_t
2586 query_clear_value(VkQueryType type)
2587 {
2588 switch (type) {
2589 case VK_QUERY_TYPE_TIMESTAMP:
2590 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2591 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2592 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2593 case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2594 return (uint32_t)TIMESTAMP_NOT_READY;
2595 default:
2596 return 0;
2597 }
2598 }
2599
2600 VKAPI_ATTR void VKAPI_CALL
2601 radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
2602 {
2603 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2604 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2605 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2606 const struct radv_physical_device *pdev = radv_device_physical(device);
2607 uint32_t value = query_clear_value(pool->vk.query_type);
2608 uint32_t flush_bits = 0;
2609
2610 if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC || cmd_buffer->qf == RADV_QUEUE_VIDEO_ENC)
2611 /* video queries don't work like this */
2612 return;
2613
2614 /* Make sure to sync all previous work if the given command buffer has
2615 * pending active queries. Otherwise the GPU might write queries data
2616 * after the reset operation.
2617 */
2618 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
2619
2620 flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
2621 queryCount * pool->stride, value);
2622
2623 if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
2624 (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11)) {
2625 flush_bits |=
2626 radv_fill_buffer(cmd_buffer, NULL, pool->bo,
2627 radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4, queryCount * 4, 0);
2628 }
2629
2630 if (flush_bits) {
2631 /* Only need to flush caches for the compute shader path. */
2632 cmd_buffer->pending_reset_query = true;
2633 cmd_buffer->state.flush_bits |= flush_bits;
2634 }
2635 }
2636
2637 VKAPI_ATTR void VKAPI_CALL
2638 radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
2639 {
2640 VK_FROM_HANDLE(radv_device, device, _device);
2641 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2642 const struct radv_physical_device *pdev = radv_device_physical(device);
2643
2644 uint32_t value = query_clear_value(pool->vk.query_type);
2645 uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
2646 uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);
2647
2648 for (uint32_t *p = data; p != data_end; ++p)
2649 *p = value;
2650
2651 if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
2652 (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && pdev->info.gfx_level >= GFX11)) {
2653 memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
2654 }
2655 }
2656
2657 static void
2658 emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, VkQueryType query_type,
2659 VkQueryControlFlags flags, uint32_t index)
2660 {
2661 switch (query_type) {
2662 case VK_QUERY_TYPE_OCCLUSION:
2663 radv_begin_occlusion_query(cmd_buffer, va, flags);
2664 break;
2665 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2666 radv_begin_pipeline_stat_query(cmd_buffer, pool, va);
2667 break;
2668 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2669 radv_begin_tfb_query(cmd_buffer, va, index);
2670 break;
2671 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
2672 radv_begin_pg_query(cmd_buffer, pool, va, index);
2673 break;
2674 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
2675 radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
2676 break;
2677 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
2678 radv_begin_ms_prim_query(cmd_buffer, va);
2679 break;
2680 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
2681 cmd_buffer->video.feedback_query_va = va;
2682 break;
2683 default:
2684 unreachable("beginning unhandled query type");
2685 }
2686 }
2687
2688 static void
2689 emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va, uint64_t avail_va,
2690 VkQueryType query_type, uint32_t index)
2691 {
2692 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2693 const struct radv_physical_device *pdev = radv_device_physical(device);
2694
2695 switch (query_type) {
2696 case VK_QUERY_TYPE_OCCLUSION:
2697 radv_end_occlusion_query(cmd_buffer, va);
2698 break;
2699 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2700 radv_end_pipeline_stat_query(cmd_buffer, pool, va, avail_va);
2701 break;
2702 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
2703 radv_end_tfb_query(cmd_buffer, va, index);
2704 break;
2705 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
2706 radv_end_pg_query(cmd_buffer, pool, va, index);
2707 break;
2708 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
2709 radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
2710 break;
2711 case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT:
2712 radv_end_ms_prim_query(cmd_buffer, va, avail_va);
2713 break;
2714 case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR:
2715 cmd_buffer->video.feedback_query_va = 0;
2716 break;
2717 default:
2718 unreachable("ending unhandled query type");
2719 }
2720
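   /* Accumulate the flushes that a later vkCmdResetQueryPool (or, with the drirc workaround,
    * vkCmdCopyQueryPoolResults) must perform so it does not race with these query writes.
    */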
2721 cmd_buffer->active_query_flush_bits |=
2722 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
2723 if (pdev->info.gfx_level >= GFX9) {
2724 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2725 }
2726 }
2727
2728 VKAPI_ATTR void VKAPI_CALL
2729 radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
2730 VkQueryControlFlags flags, uint32_t index)
2731 {
2732 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2733 VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
2734 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2735 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2736 uint64_t va = radv_buffer_get_va(pool->bo);
2737
2738 radv_cs_add_buffer(device->ws, cs, pool->bo);
2739
2740 emit_query_flush(cmd_buffer, pool);
2741
2742 va += pool->stride * query;
2743
2744 if (pool->uses_ace) {
2745 if (!radv_gang_init(cmd_buffer))
2746 return;
2747
2748 radv_cs_add_buffer(device->ws, cmd_buffer->gang.cs, pool->bo);
2749 }
2750
2751 if (pool->uses_shader_query_buf)
2752 radv_alloc_shader_query_buf(cmd_buffer);
2753
2754 emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, flags, index);
2755 }
2756
2757 VKAPI_ATTR void VKAPI_CALL
2758 radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, uint32_t index)
2759 {
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   uint64_t va = radv_buffer_get_va(pool->bo);
   uint64_t avail_va = va + pool->availability_offset + 4 * query;
   va += pool->stride * query;

   /* Do not need to add the pool BO to the list because the query must
    * currently be active, which means the BO is already in the list.
    */
   emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, index);

   /*
    * For multiview we have to emit a query for each bit in the mask,
    * however the first query we emit will get the totals for all the
    * operations, so we don't want to get a real value in the other
    * queries. This emits a fake begin/end sequence so the waiting
    * code gets a completed query value and doesn't hang, but the
    * query returns 0.
    */
   if (cmd_buffer->state.render.view_mask) {
      for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.render.view_mask); i++) {
         va += pool->stride;
         avail_va += 4;
         emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, 0, 0);
         emit_end_query(cmd_buffer, pool, va, avail_va, pool->vk.query_type, 0);
      }
   }
}

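/* Writes the 64-bit GPU timestamp counter to "va". For TOP_OF_PIPE the value
 * is copied immediately with a COPY_DATA packet; any other stage uses a
 * bottom-of-pipe EOP event so the timestamp only lands once prior work has
 * drained the pipeline.
 */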
void
radv_write_timestamp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkPipelineStageFlags2 stage)
{
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM | COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
                         COPY_DATA_DST_SEL(V_370_MEM));
      radeon_emit(cs, 0);
      radeon_emit(cs, 0);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
   } else {
      radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
                                   EOP_DST_SEL_MEM, EOP_DATA_SEL_TIMESTAMP, va, 0, cmd_buffer->gfx9_eop_bug_va);
   }
}

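/* Writes one timestamp per active view (or a single one without multiview)
 * into consecutive query slots. On transfer queues the value comes from the
 * dedicated SDMA global-timestamp packet instead of a PM4 event.
 *
 * Illustrative application-side use of this entrypoint (a sketch, not driver
 * code; "cmd", "dev", "ts_pool" and "period" are placeholder names):
 *
 *    uint64_t ts[2];
 *    vkCmdResetQueryPool(cmd, ts_pool, 0, 2);
 *    vkCmdWriteTimestamp2(cmd, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, ts_pool, 0);
 *    // ... work to be measured ...
 *    vkCmdWriteTimestamp2(cmd, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, ts_pool, 1);
 *    // after the command buffer has been submitted:
 *    vkGetQueryPoolResults(dev, ts_pool, 0, 2, sizeof(ts), ts, sizeof(uint64_t),
 *                          VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
 *    double elapsed_ns = (double)(ts[1] - ts[0]) * period; // timestampPeriod
 */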
VKAPI_ATTR void VKAPI_CALL
radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkQueryPool queryPool,
                        uint32_t query)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   const unsigned num_queries = MAX2(util_bitcount(cmd_buffer->state.render.view_mask), 1);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const uint64_t va = radv_buffer_get_va(pool->bo);
   uint64_t query_va = va + pool->stride * query;

   radv_cs_add_buffer(device->ws, cs, pool->bo);

   assert(cmd_buffer->qf != RADV_QUEUE_VIDEO_DEC &&
          cmd_buffer->qf != RADV_QUEUE_VIDEO_ENC);

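   /* Transfer (SDMA) queues cannot use the PM4 paths below; each timestamp is
    * written with the SDMA global-timestamp packet, preceded by a NOP packet
    * when the flush_before_timestamp_write workaround is enabled.
    */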
   if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
      if (instance->drirc.flush_before_timestamp_write) {
         radeon_check_space(device->ws, cmd_buffer->cs, 1);
         radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
      }

      for (unsigned i = 0; i < num_queries; ++i, query_va += pool->stride) {
         radeon_check_space(device->ws, cmd_buffer->cs, 3);
         radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
         radeon_emit(cs, query_va);
         radeon_emit(cs, query_va >> 32);
      }
      return;
   }

   if (instance->drirc.flush_before_timestamp_write) {
      /* Make sure previously launched waves have finished */
      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
   }

   radv_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 28 * num_queries);

   for (unsigned i = 0; i < num_queries; i++) {
      radv_write_timestamp(cmd_buffer, query_va, stage);
      query_va += pool->stride;
   }

   cmd_buffer->active_query_flush_bits |=
      RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
   if (pdev->info.gfx_level >= GFX9) {
      cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

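/* Resolves acceleration-structure property queries on the GPU by copying the
 * requested 64-bit field out of each acceleration structure's header into the
 * corresponding query slot with COPY_DATA.
 *
 * Illustrative application-side use for compaction (a sketch, not driver code;
 * "cmd", "src_as", "compacted_as" and "query_pool" are placeholder names, and
 * src_as must have been built with ALLOW_COMPACTION):
 *
 *    vkCmdWriteAccelerationStructuresPropertiesKHR(
 *       cmd, 1, &src_as, VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR, query_pool, 0);
 *    // submit, read the size back with vkGetQueryPoolResults(), create a
 *    // smaller destination acceleration structure, then:
 *    const VkCopyAccelerationStructureInfoKHR copy = {
 *       .sType = VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,
 *       .src = src_as,
 *       .dst = compacted_as,
 *       .mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR,
 *    };
 *    vkCmdCopyAccelerationStructureKHR(cmd, &copy);
 */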
VKAPI_ATTR void VKAPI_CALL
radv_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
                                                 const VkAccelerationStructureKHR *pAccelerationStructures,
                                                 VkQueryType queryType, VkQueryPool queryPool, uint32_t firstQuery)
{
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_query_pool, pool, queryPool);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t pool_va = radv_buffer_get_va(pool->bo);
   uint64_t query_va = pool_va + pool->stride * firstQuery;

   radv_cs_add_buffer(device->ws, cs, pool->bo);

   radv_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 6 * accelerationStructureCount);

   for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
      VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pAccelerationStructures[i]);
      uint64_t va = vk_acceleration_structure_get_va(accel_struct);

      switch (queryType) {
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, compacted_size);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, serialization_size);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
         va += offsetof(struct radv_accel_struct_header, instance_count);
         break;
      case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
         va += offsetof(struct radv_accel_struct_header, size);
         break;
      default:
         unreachable("Unhandled accel struct query type.");
      }

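      /* COPY_DATA with COUNT_SEL set performs a 64-bit copy, moving the header
       * field straight into the query slot in the pool BO.
       */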
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, query_va);
      radeon_emit(cs, query_va >> 32);

      query_va += pool->stride;
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
