1 /*
2  * Copyright 2016 Red Hat Inc.
3  * Based on anv:
4  * Copyright © 2015 Intel Corporation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23  * IN THE SOFTWARE.
24  */
25 
26 #include <assert.h>
27 #include <fcntl.h>
28 #include <stdbool.h>
29 #include <string.h>
30 
31 #include "nir/nir_builder.h"
32 #include "util/u_atomic.h"
33 #include "vulkan/vulkan_core.h"
34 #include "radv_acceleration_structure.h"
35 #include "radv_cs.h"
36 #include "radv_meta.h"
37 #include "radv_private.h"
38 #include "sid.h"
39 
40 #define TIMESTAMP_NOT_READY UINT64_MAX
41 
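/* Each pipeline statistics sample written by the GPU is a block of eleven 64-bit counters,
 * hence the 11 * 8 byte block size; pipeline_statistics_indices maps every Vulkan
 * VK_QUERY_PIPELINE_STATISTIC_* bit, by bit position, to that counter's slot within the block.
 */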
42 static const int pipelinestat_block_size = 11 * 8;
43 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
44 
45 static void
46 radv_store_availability(nir_builder *b, nir_ssa_def *flags, nir_ssa_def *dst_buf,
47                         nir_ssa_def *offset, nir_ssa_def *value32)
48 {
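   /* Only store the availability value when VK_QUERY_RESULT_WITH_AVAILABILITY_BIT was requested,
    * widening it to a zero-extended 64-bit value when VK_QUERY_RESULT_64_BIT is set.
    */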
49    nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
50 
51    nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
52 
53    nir_store_ssbo(b, nir_vec2(b, value32, nir_imm_int(b, 0)), dst_buf, offset, .align_mul = 8);
54 
55    nir_push_else(b, NULL);
56 
57    nir_store_ssbo(b, value32, dst_buf, offset);
58 
59    nir_pop_if(b, NULL);
60 
61    nir_pop_if(b, NULL);
62 }
63 
64 static nir_shader *
65 build_occlusion_query_shader(struct radv_device *device)
66 {
67    /* the shader this builds is roughly
68     *
69     * push constants {
70     * 	uint32_t flags;
71     * 	uint32_t dst_stride;
72     * };
73     *
74     * uint32_t src_stride = 16 * db_count;
75     *
76     * location(binding = 0) buffer dst_buf;
77     * location(binding = 1) buffer src_buf;
78     *
79     * void main() {
80     * 	uint64_t result = 0;
81     * 	uint64_t src_offset = src_stride * global_id.x;
82     * 	uint64_t dst_offset = dst_stride * global_id.x;
83     * 	bool available = true;
84     * 	for (int i = 0; i < db_count; ++i) {
85     *		if (enabled_rb_mask & (1 << i)) {
86     *			uint64_t start = src_buf[src_offset + 16 * i];
87     *			uint64_t end = src_buf[src_offset + 16 * i + 8];
88     *			if ((start & (1ull << 63)) && (end & (1ull << 63)))
89     *				result += end - start;
90     *			else
91     *				available = false;
92     *		}
93     * 	}
94     * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
95     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
96     * 		if (flags & VK_QUERY_RESULT_64_BIT)
97     * 			dst_buf[dst_offset] = result;
98     * 		else
99     * 			dst_buf[dst_offset] = (uint32_t)result;
100     * 	}
101     * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
102     * 		dst_buf[dst_offset + elem_size] = available;
103     * 	}
104     * }
105     */
106    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "occlusion_query");
107    b.shader->info.workgroup_size[0] = 64;
108 
109    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
110    nir_variable *outer_counter =
111       nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
112    nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
113    nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
114    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
115    unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
116    unsigned db_count = device->physical_device->rad_info.max_render_backends;
117 
118    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
119 
120    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
121    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
122 
123    nir_ssa_def *global_id = get_global_ids(&b, 1);
124 
125    nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
126    nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
127    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
128    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
129 
130    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
131    nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
132    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
133 
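   /* Walk over all render backends; only RBs present in enabled_rb_mask hold valid
    * begin/end occlusion counter pairs.
    */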
134    nir_push_loop(&b);
135 
136    nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
137    radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
138 
139    nir_ssa_def *enabled_cond =
140       nir_iand_imm(&b, nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count), enabled_rb_mask);
141 
142    nir_push_if(&b, nir_i2b(&b, enabled_cond));
143 
144    nir_ssa_def *load_offset = nir_imul_imm(&b, current_outer_count, 16);
145    load_offset = nir_iadd(&b, input_base, load_offset);
146 
147    nir_ssa_def *load = nir_load_ssbo(&b, 2, 64, src_buf, load_offset, .align_mul = 16);
148 
149    nir_store_var(&b, start, nir_channel(&b, load, 0), 0x1);
150    nir_store_var(&b, end, nir_channel(&b, load, 1), 0x1);
151 
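   /* Bit 63 of each counter is set once the value has been written, so a signed "less than
    * zero" test on the 64-bit value checks that the result is ready.
    */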
152    nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
153    nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
154 
155    nir_push_if(&b, nir_iand(&b, start_done, end_done));
156 
157    nir_store_var(&b, result,
158                  nir_iadd(&b, nir_load_var(&b, result),
159                           nir_isub(&b, nir_load_var(&b, end), nir_load_var(&b, start))),
160                  0x1);
161 
162    nir_push_else(&b, NULL);
163 
164    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
165 
166    nir_pop_if(&b, NULL);
167    nir_pop_if(&b, NULL);
168    nir_pop_loop(&b, NULL);
169 
170    /* Store the result if complete or if partial results have been requested. */
171 
172    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
173    nir_ssa_def *result_size =
174       nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
175    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
176                            nir_load_var(&b, available)));
177 
178    nir_push_if(&b, result_is_64bit);
179 
180    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base, .align_mul = 8);
181 
182    nir_push_else(&b, NULL);
183 
184    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base,
185                   .align_mul = 8);
186 
187    nir_pop_if(&b, NULL);
188    nir_pop_if(&b, NULL);
189 
190    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
191                            nir_b2i32(&b, nir_load_var(&b, available)));
192 
193    return b.shader;
194 }
195 
196 static nir_shader *
197 build_pipeline_statistics_query_shader(struct radv_device *device)
198 {
199    /* the shader this builds is roughly
200     *
201     * push constants {
202     * 	uint32_t flags;
203     * 	uint32_t dst_stride;
204     * 	uint32_t stats_mask;
205     * 	uint32_t avail_offset;
206     * };
207     *
208     * uint32_t src_stride = pipelinestat_block_size * 2;
209     *
210     * location(binding = 0) buffer dst_buf;
211     * location(binding = 1) buffer src_buf;
212     *
213     * void main() {
214     * 	uint64_t src_offset = src_stride * global_id.x;
215     * 	uint64_t dst_base = dst_stride * global_id.x;
216     * 	uint64_t dst_offset = dst_base;
217     * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
218     * 	uint32_t elem_count = stats_mask >> 16;
219     * 	uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
220     * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
221     * 		dst_buf[dst_offset + elem_count * elem_size] = available32;
222     * 	}
223     * 	if ((bool)available32) {
224     * 		// repeat 11 times:
225     * 		if (stats_mask & (1 << 0)) {
226     * 			uint64_t start = src_buf[src_offset + 8 * indices[0]];
227     * 			uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
228     * 			uint64_t result = end - start;
229     * 			if (flags & VK_QUERY_RESULT_64_BIT)
230     * 				dst_buf[dst_offset] = result;
231     * 			else dst_buf[dst_offset] = (uint32_t)result;
232     * 			dst_offset += elem_size;
233     * 		}
234     * 	} else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
235     *              // Set everything to 0 as we don't know what is valid.
236     * 		for (int i = 0; i < elem_count; ++i)
237     * 			dst_buf[dst_base + elem_size * i] = 0;
238     * 	}
239     * }
240     */
241    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pipeline_statistics_query");
242    b.shader->info.workgroup_size[0] = 64;
243 
244    nir_variable *output_offset =
245       nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
246    nir_variable *result =
247       nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
248 
249    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
250    nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
251    nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16);
252    nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
253 
254    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
255    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
256 
257    nir_ssa_def *global_id = get_global_ids(&b, 1);
258 
259    nir_variable *input_stride = nir_local_variable_create(b.impl, glsl_int_type(), "input_stride");
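   /* When GDS is used (NGG), two extra 64-bit values (begin/end) follow the two pipeline
    * statistics blocks, which enlarges the per-query input stride.
    */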
260    nir_push_if(&b, nir_ine(&b, uses_gds, nir_imm_int(&b, 0)));
261    {
262       nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2 + 8 * 2), 0x1);
263    }
264    nir_push_else(&b, NULL);
265    {
266       nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size * 2), 0x1);
267    }
268    nir_pop_if(&b, NULL);
269 
270    nir_ssa_def *input_base = nir_imul(&b, nir_load_var(&b, input_stride), global_id);
271    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
272    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
273 
274    avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
275 
276    nir_ssa_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
277 
278    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
279    nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
280    nir_ssa_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
281 
282    radv_store_availability(&b, flags, dst_buf,
283                            nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
284                            available32);
285 
286    nir_push_if(&b, nir_i2b(&b, available32));
287 
288    nir_store_var(&b, output_offset, output_base, 0x1);
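   /* Emit one result for each statistic requested in stats_mask, walking them in Vulkan bit
    * order and advancing the destination by elem_size after every store.
    */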
289    for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
290       nir_push_if(&b, nir_test_mask(&b, stats_mask, BITFIELD64_BIT(i)));
291 
292       nir_ssa_def *start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8);
293       nir_ssa_def *start = nir_load_ssbo(&b, 1, 64, src_buf, start_offset);
294 
295       nir_ssa_def *end_offset =
296          nir_iadd_imm(&b, input_base, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size);
297       nir_ssa_def *end = nir_load_ssbo(&b, 1, 64, src_buf, end_offset);
298 
299       nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
300 
301       nir_push_if(&b, nir_iand(&b, nir_i2b(&b, uses_gds),
302                                nir_ieq(&b, nir_imm_int(&b, 1u << i),
303                                        nir_imm_int(&b, VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))));
304       {
305          /* Compute the GDS result if needed. */
306          nir_ssa_def *gds_start_offset =
307             nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2));
308          nir_ssa_def *gds_start = nir_load_ssbo(&b, 1, 64, src_buf, gds_start_offset);
309 
310          nir_ssa_def *gds_end_offset =
311             nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 2 + 8));
312          nir_ssa_def *gds_end = nir_load_ssbo(&b, 1, 64, src_buf, gds_end_offset);
313 
314          nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
315 
316          nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1);
317       }
318       nir_pop_if(&b, NULL);
319 
320       /* Store result */
321       nir_push_if(&b, result_is_64bit);
322 
323       nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, nir_load_var(&b, output_offset));
324 
325       nir_push_else(&b, NULL);
326 
327       nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, nir_load_var(&b, output_offset));
328 
329       nir_pop_if(&b, NULL);
330 
331       nir_store_var(&b, output_offset, nir_iadd(&b, nir_load_var(&b, output_offset), elem_size),
332                     0x1);
333 
334       nir_pop_if(&b, NULL);
335    }
336 
337    nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
338 
339    nir_push_if(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
340 
341    /* Stores zeros in all outputs. */
342 
343    nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
344    nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
345 
346    nir_loop *loop = nir_push_loop(&b);
347 
348    nir_ssa_def *current_counter = nir_load_var(&b, counter);
349    radv_break_on_count(&b, counter, elem_count);
350 
351    nir_ssa_def *output_elem = nir_iadd(&b, output_base, nir_imul(&b, elem_size, current_counter));
352    nir_push_if(&b, result_is_64bit);
353 
354    nir_store_ssbo(&b, nir_imm_int64(&b, 0), dst_buf, output_elem);
355 
356    nir_push_else(&b, NULL);
357 
358    nir_store_ssbo(&b, nir_imm_int(&b, 0), dst_buf, output_elem);
359 
360    nir_pop_if(&b, NULL);
361 
362    nir_pop_loop(&b, loop);
363    nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
364    nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
365    return b.shader;
366 }
367 
368 static nir_shader *
369 build_tfb_query_shader(struct radv_device *device)
370 {
371    /* the shader this builds is roughly
372     *
373     * uint32_t src_stride = 32;
374     *
375     * location(binding = 0) buffer dst_buf;
376     * location(binding = 1) buffer src_buf;
377     *
378     * void main() {
379     *	uint64_t result[2] = {};
380     *	bool available = false;
381     *	uint64_t src_offset = src_stride * global_id.x;
382     * 	uint64_t dst_offset = dst_stride * global_id.x;
383     * 	uint64_t *src_data = src_buf[src_offset];
384     *	uint32_t avail = (src_data[0] >> 32) &
385     *			 (src_data[1] >> 32) &
386     *			 (src_data[2] >> 32) &
387     *			 (src_data[3] >> 32);
388     *	if (avail & 0x80000000) {
389     *		result[0] = src_data[3] - src_data[1];
390     *		result[1] = src_data[2] - src_data[0];
391     *		available = true;
392     *	}
393     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
394     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
395     *		if (flags & VK_QUERY_RESULT_64_BIT) {
396     *			dst_buf[dst_offset] = result;
397     *		} else {
398     *			dst_buf[dst_offset] = (uint32_t)result;
399     *		}
400     *	}
401     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
402     *		dst_buf[dst_offset + result_size] = available;
403     * 	}
404     * }
405     */
406    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "tfb_query");
407    b.shader->info.workgroup_size[0] = 64;
408 
409    /* Create and initialize local variables. */
410    nir_variable *result =
411       nir_local_variable_create(b.impl, glsl_vector_type(GLSL_TYPE_UINT64, 2), "result");
412    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
413 
414    nir_store_var(&b, result, nir_vec2(&b, nir_imm_int64(&b, 0), nir_imm_int64(&b, 0)), 0x3);
415    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
416 
417    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
418 
419    /* Load resources. */
420    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
421    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
422 
423    /* Compute global ID. */
424    nir_ssa_def *global_id = get_global_ids(&b, 1);
425 
426    /* Compute src/dst strides. */
427    nir_ssa_def *input_stride = nir_imm_int(&b, 32);
428    nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
429    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
430    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
431 
432    /* Load data from the query pool. */
433    nir_ssa_def *load1 = nir_load_ssbo(&b, 4, 32, src_buf, input_base, .align_mul = 32);
434    nir_ssa_def *load2 =
435       nir_load_ssbo(&b, 4, 32, src_buf, nir_iadd_imm(&b, input_base, 16), .align_mul = 16);
436 
437    /* Check if result is available. */
438    nir_ssa_def *avails[2];
439    avails[0] = nir_iand(&b, nir_channel(&b, load1, 1), nir_channel(&b, load1, 3));
440    avails[1] = nir_iand(&b, nir_channel(&b, load2, 1), nir_channel(&b, load2, 3));
441    nir_ssa_def *result_is_available =
442       nir_test_mask(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000);
443 
444    /* Only compute result if available. */
445    nir_push_if(&b, result_is_available);
446 
447    /* Pack values. */
448    nir_ssa_def *packed64[4];
449    packed64[0] =
450       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1)));
451    packed64[1] =
452       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 2), nir_channel(&b, load1, 3)));
453    packed64[2] =
454       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1)));
455    packed64[3] =
456       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 2), nir_channel(&b, load2, 3)));
457 
458    /* Compute result. */
459    nir_ssa_def *num_primitive_written = nir_isub(&b, packed64[3], packed64[1]);
460    nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[2], packed64[0]);
461 
462    nir_store_var(&b, result, nir_vec2(&b, num_primitive_written, primitive_storage_needed), 0x3);
463    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
464 
465    nir_pop_if(&b, NULL);
466 
467    /* Determine if result is 64 or 32 bit. */
468    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
469    nir_ssa_def *result_size =
470       nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
471 
472    /* Store the result if complete or partial results have been requested. */
473    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
474                            nir_load_var(&b, available)));
475 
476    /* Store result. */
477    nir_push_if(&b, result_is_64bit);
478 
479    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
480 
481    nir_push_else(&b, NULL);
482 
483    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
484 
485    nir_pop_if(&b, NULL);
486    nir_pop_if(&b, NULL);
487 
488    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
489                            nir_b2i32(&b, nir_load_var(&b, available)));
490 
491    return b.shader;
492 }
493 
494 static nir_shader *
495 build_timestamp_query_shader(struct radv_device *device)
496 {
497    /* the shader this builds is roughly
498     *
499     * uint32_t src_stride = 8;
500     *
501     * location(binding = 0) buffer dst_buf;
502     * location(binding = 1) buffer src_buf;
503     *
504     * void main() {
505     *	uint64_t result = 0;
506     *	bool available = false;
507     *	uint64_t src_offset = src_stride * global_id.x;
508     * 	uint64_t dst_offset = dst_stride * global_id.x;
509     * 	uint64_t timestamp = src_buf[src_offset];
510     *	if (timestamp != TIMESTAMP_NOT_READY) {
511     *		result = timestamp;
512     *		available = true;
513     *	}
514     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
515     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
516     *		if (flags & VK_QUERY_RESULT_64_BIT) {
517     *			dst_buf[dst_offset] = result;
518     *		} else {
519     *			dst_buf[dst_offset] = (uint32_t)result;
520     *		}
521     *	}
522     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
523     *		dst_buf[dst_offset + result_size] = available;
524     * 	}
525     * }
526     */
527    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "timestamp_query");
528    b.shader->info.workgroup_size[0] = 64;
529 
530    /* Create and initialize local variables. */
531    nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
532    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
533 
534    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
535    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
536 
537    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
538 
539    /* Load resources. */
540    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
541    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
542 
543    /* Compute global ID. */
544    nir_ssa_def *global_id = get_global_ids(&b, 1);
545 
546    /* Compute src/dst strides. */
547    nir_ssa_def *input_stride = nir_imm_int(&b, 8);
548    nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
549    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 8);
550    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
551 
552    /* Load data from the query pool. */
553    nir_ssa_def *load = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 8);
554 
555    /* Pack the timestamp. */
556    nir_ssa_def *timestamp;
557    timestamp =
558       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load, 0), nir_channel(&b, load, 1)));
559 
560    /* Check if result is available. */
561    nir_ssa_def *result_is_available = nir_i2b(&b, nir_ine_imm(&b, timestamp, TIMESTAMP_NOT_READY));
562 
563    /* Only store result if available. */
564    nir_push_if(&b, result_is_available);
565 
566    nir_store_var(&b, result, timestamp, 0x1);
567    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
568 
569    nir_pop_if(&b, NULL);
570 
571    /* Determine if result is 64 or 32 bit. */
572    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
573    nir_ssa_def *result_size =
574       nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
575 
576    /* Store the result if complete or partial results have been requested. */
577    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
578                            nir_load_var(&b, available)));
579 
580    /* Store result. */
581    nir_push_if(&b, result_is_64bit);
582 
583    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
584 
585    nir_push_else(&b, NULL);
586 
587    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
588 
589    nir_pop_if(&b, NULL);
590 
591    nir_pop_if(&b, NULL);
592 
593    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
594                            nir_b2i32(&b, nir_load_var(&b, available)));
595 
596    return b.shader;
597 }
598 
599 static nir_shader *
600 build_pg_query_shader(struct radv_device *device)
601 {
602    /* the shader this builds is roughly
603     *
604     * uint32_t src_stride = 32;
605     *
606     * location(binding = 0) buffer dst_buf;
607     * location(binding = 1) buffer src_buf;
608     *
609     * void main() {
610     *	uint64_t result = {};
611     *	bool available = false;
612     *	uint64_t src_offset = src_stride * global_id.x;
613     * 	uint64_t dst_offset = dst_stride * global_id.x;
614     * 	uint64_t *src_data = src_buf[src_offset];
615     *	uint32_t avail = (src_data[0] >> 32) &
616     *			 (src_data[2] >> 32);
617     *	if (avail & 0x80000000) {
618     *		result = src_data[2] - src_data[0];
619     *	        if (use_gds) {
620     *			uint64_t ngg_gds_result = 0;
621     *			ngg_gds_result += src_data[5] - src_data[4];
622     *			ngg_gds_result += src_data[7] - src_data[6];
623     *			result += ngg_gds_result;
624     *	        }
625     *		available = true;
626     *	}
627     * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
628     * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
629     *		if (flags & VK_QUERY_RESULT_64_BIT) {
630     *			dst_buf[dst_offset] = result;
631     *		} else {
632     *			dst_buf[dst_offset] = (uint32_t)result;
633     *		}
634     *	}
635     *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
636     *		dst_buf[dst_offset + result_size] = available;
637     * 	}
638     * }
639     */
640    nir_builder b = radv_meta_init_shader(device, MESA_SHADER_COMPUTE, "pg_query");
641    b.shader->info.workgroup_size[0] = 64;
642 
643    /* Create and initialize local variables. */
644    nir_variable *result =
645       nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
646    nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
647 
648    nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
649    nir_store_var(&b, available, nir_imm_false(&b), 0x1);
650 
651    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 16);
652 
653    /* Load resources. */
654    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
655    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
656 
657    /* Compute global ID. */
658    nir_ssa_def *global_id = get_global_ids(&b, 1);
659 
660    /* Compute src/dst strides. */
661    nir_ssa_def *input_stride = nir_imm_int(&b, 32);
662    nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
663    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 4), .range = 16);
664    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
665 
666    /* Load data from the query pool. */
667    nir_ssa_def *load1 = nir_load_ssbo(&b, 2, 32, src_buf, input_base, .align_mul = 32);
668    nir_ssa_def *load2 = nir_load_ssbo(
669       &b, 2, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 16)), .align_mul = 16);
670 
671    /* Check if result is available. */
672    nir_ssa_def *avails[2];
673    avails[0] = nir_channel(&b, load1, 1);
674    avails[1] = nir_channel(&b, load2, 1);
675    nir_ssa_def *result_is_available =
676       nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), nir_imm_int(&b, 0x80000000)));
677 
678    /* Only compute result if available. */
679    nir_push_if(&b, result_is_available);
680 
681    /* Pack values. */
682    nir_ssa_def *packed64[2];
683    packed64[0] =
684       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load1, 0), nir_channel(&b, load1, 1)));
685    packed64[1] =
686       nir_pack_64_2x32(&b, nir_vec2(&b, nir_channel(&b, load2, 0), nir_channel(&b, load2, 1)));
687 
688    /* Compute result. */
689    nir_ssa_def *primitive_storage_needed = nir_isub(&b, packed64[1], packed64[0]);
690 
691    nir_store_var(&b, result, primitive_storage_needed, 0x1);
692 
693    nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20);
694    nir_push_if(&b, nir_i2b(&b, uses_gds));
695    {
696       /* NGG GS result */
697       nir_ssa_def *gds_start =
698          nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 8);
699       nir_ssa_def *gds_end =
700          nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 8);
701 
702       nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
703 
704       /* NGG VS/TES result */
705       gds_start =
706          nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 48)), .align_mul = 8);
707       gds_end =
708          nir_load_ssbo(&b, 1, 64, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 56)), .align_mul = 8);
709 
710       ngg_gds_result = nir_iadd(&b, ngg_gds_result, nir_isub(&b, gds_end, gds_start));
711 
712       nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), ngg_gds_result), 0x1);
713    }
714    nir_pop_if(&b, NULL);
715 
716    nir_store_var(&b, available, nir_imm_true(&b), 0x1);
717 
718    nir_pop_if(&b, NULL);
719 
720    /* Determine if result is 64 or 32 bit. */
721    nir_ssa_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
722    nir_ssa_def *result_size =
723       nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), nir_imm_int(&b, 8));
724 
725    /* Store the result if complete or partial results have been requested. */
726    nir_push_if(&b, nir_ior(&b, nir_test_mask(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
727                            nir_load_var(&b, available)));
728 
729    /* Store result. */
730    nir_push_if(&b, result_is_64bit);
731 
732    nir_store_ssbo(&b, nir_load_var(&b, result), dst_buf, output_base);
733 
734    nir_push_else(&b, NULL);
735 
736    nir_store_ssbo(&b, nir_u2u32(&b, nir_load_var(&b, result)), dst_buf, output_base);
737 
738    nir_pop_if(&b, NULL);
739    nir_pop_if(&b, NULL);
740 
741    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, result_size, output_base),
742                            nir_b2i32(&b, nir_load_var(&b, available)));
743 
744    return b.shader;
745 }
746 
747 static VkResult
748 radv_device_init_meta_query_state_internal(struct radv_device *device)
749 {
750    VkResult result;
751    nir_shader *occlusion_cs = NULL;
752    nir_shader *pipeline_statistics_cs = NULL;
753    nir_shader *tfb_cs = NULL;
754    nir_shader *timestamp_cs = NULL;
755    nir_shader *pg_cs = NULL;
756 
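   /* All query copy pipelines are created together under the meta-state mutex; if the pipeline
    * statistics pipeline already exists, initialization has already been done.
    */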
757    mtx_lock(&device->meta_state.mtx);
758    if (device->meta_state.query.pipeline_statistics_query_pipeline) {
759       mtx_unlock(&device->meta_state.mtx);
760       return VK_SUCCESS;
761    }
762    occlusion_cs = build_occlusion_query_shader(device);
763    pipeline_statistics_cs = build_pipeline_statistics_query_shader(device);
764    tfb_cs = build_tfb_query_shader(device);
765    timestamp_cs = build_timestamp_query_shader(device);
766    pg_cs = build_pg_query_shader(device);
767 
768    VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
769       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
770       .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
771       .bindingCount = 2,
772       .pBindings = (VkDescriptorSetLayoutBinding[]){
773          {.binding = 0,
774           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
775           .descriptorCount = 1,
776           .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
777           .pImmutableSamplers = NULL},
778          {.binding = 1,
779           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
780           .descriptorCount = 1,
781           .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
782           .pImmutableSamplers = NULL},
783       }};
784 
785    result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &occlusion_ds_create_info,
786                                            &device->meta_state.alloc,
787                                            &device->meta_state.query.ds_layout);
788    if (result != VK_SUCCESS)
789       goto fail;
790 
791    VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
792       .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
793       .setLayoutCount = 1,
794       .pSetLayouts = &device->meta_state.query.ds_layout,
795       .pushConstantRangeCount = 1,
796       .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 20},
797    };
798 
799    result =
800       radv_CreatePipelineLayout(radv_device_to_handle(device), &occlusion_pl_create_info,
801                                 &device->meta_state.alloc, &device->meta_state.query.p_layout);
802    if (result != VK_SUCCESS)
803       goto fail;
804 
805    VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
806       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
807       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
808       .module = vk_shader_module_handle_from_nir(occlusion_cs),
809       .pName = "main",
810       .pSpecializationInfo = NULL,
811    };
812 
813    VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
814       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
815       .stage = occlusion_pipeline_shader_stage,
816       .flags = 0,
817       .layout = device->meta_state.query.p_layout,
818    };
819 
820    result = radv_CreateComputePipelines(
821       radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
822       &occlusion_vk_pipeline_info, NULL, &device->meta_state.query.occlusion_query_pipeline);
823    if (result != VK_SUCCESS)
824       goto fail;
825 
826    VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
827       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
828       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
829       .module = vk_shader_module_handle_from_nir(pipeline_statistics_cs),
830       .pName = "main",
831       .pSpecializationInfo = NULL,
832    };
833 
834    VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
835       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
836       .stage = pipeline_statistics_pipeline_shader_stage,
837       .flags = 0,
838       .layout = device->meta_state.query.p_layout,
839    };
840 
841    result = radv_CreateComputePipelines(
842       radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
843       &pipeline_statistics_vk_pipeline_info, NULL,
844       &device->meta_state.query.pipeline_statistics_query_pipeline);
845    if (result != VK_SUCCESS)
846       goto fail;
847 
848    VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
849       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
850       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
851       .module = vk_shader_module_handle_from_nir(tfb_cs),
852       .pName = "main",
853       .pSpecializationInfo = NULL,
854    };
855 
856    VkComputePipelineCreateInfo tfb_pipeline_info = {
857       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
858       .stage = tfb_pipeline_shader_stage,
859       .flags = 0,
860       .layout = device->meta_state.query.p_layout,
861    };
862 
863    result = radv_CreateComputePipelines(
864       radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
865       &tfb_pipeline_info, NULL, &device->meta_state.query.tfb_query_pipeline);
866    if (result != VK_SUCCESS)
867       goto fail;
868 
869    VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
870       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
871       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
872       .module = vk_shader_module_handle_from_nir(timestamp_cs),
873       .pName = "main",
874       .pSpecializationInfo = NULL,
875    };
876 
877    VkComputePipelineCreateInfo timestamp_pipeline_info = {
878       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
879       .stage = timestamp_pipeline_shader_stage,
880       .flags = 0,
881       .layout = device->meta_state.query.p_layout,
882    };
883 
884    result = radv_CreateComputePipelines(
885       radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
886       &timestamp_pipeline_info, NULL, &device->meta_state.query.timestamp_query_pipeline);
887    if (result != VK_SUCCESS)
888       goto fail;
889 
890    VkPipelineShaderStageCreateInfo pg_pipeline_shader_stage = {
891       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
892       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
893       .module = vk_shader_module_handle_from_nir(pg_cs),
894       .pName = "main",
895       .pSpecializationInfo = NULL,
896    };
897 
898    VkComputePipelineCreateInfo pg_pipeline_info = {
899       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
900       .stage = pg_pipeline_shader_stage,
901       .flags = 0,
902       .layout = device->meta_state.query.p_layout,
903    };
904 
905    result = radv_CreateComputePipelines(
906       radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
907       &pg_pipeline_info, NULL, &device->meta_state.query.pg_query_pipeline);
908 
909 fail:
910    ralloc_free(occlusion_cs);
911    ralloc_free(pipeline_statistics_cs);
912    ralloc_free(tfb_cs);
913    ralloc_free(pg_cs);
914    ralloc_free(timestamp_cs);
915    mtx_unlock(&device->meta_state.mtx);
916    return result;
917 }
918 
919 VkResult
920 radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
921 {
922    if (on_demand)
923       return VK_SUCCESS;
924 
925    return radv_device_init_meta_query_state_internal(device);
926 }
927 
928 void
929 radv_device_finish_meta_query_state(struct radv_device *device)
930 {
931    if (device->meta_state.query.tfb_query_pipeline)
932       radv_DestroyPipeline(radv_device_to_handle(device),
933                            device->meta_state.query.tfb_query_pipeline, &device->meta_state.alloc);
934 
935    if (device->meta_state.query.pipeline_statistics_query_pipeline)
936       radv_DestroyPipeline(radv_device_to_handle(device),
937                            device->meta_state.query.pipeline_statistics_query_pipeline,
938                            &device->meta_state.alloc);
939 
940    if (device->meta_state.query.occlusion_query_pipeline)
941       radv_DestroyPipeline(radv_device_to_handle(device),
942                            device->meta_state.query.occlusion_query_pipeline,
943                            &device->meta_state.alloc);
944 
945    if (device->meta_state.query.timestamp_query_pipeline)
946       radv_DestroyPipeline(radv_device_to_handle(device),
947                            device->meta_state.query.timestamp_query_pipeline,
948                            &device->meta_state.alloc);
949 
950    if (device->meta_state.query.pg_query_pipeline)
951       radv_DestroyPipeline(radv_device_to_handle(device),
952                            device->meta_state.query.pg_query_pipeline, &device->meta_state.alloc);
953 
954    if (device->meta_state.query.p_layout)
955       radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.query.p_layout,
956                                  &device->meta_state.alloc);
957 
958    if (device->meta_state.query.ds_layout)
959       device->vk.dispatch_table.DestroyDescriptorSetLayout(radv_device_to_handle(device),
960                                                            device->meta_state.query.ds_layout,
961                                                            &device->meta_state.alloc);
962 }
963 
964 static void
965 radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline,
966                   struct radeon_winsys_bo *src_bo, struct radeon_winsys_bo *dst_bo,
967                   uint64_t src_offset, uint64_t dst_offset, uint32_t src_stride,
968                   uint32_t dst_stride, size_t dst_size, uint32_t count, uint32_t flags,
969                   uint32_t pipeline_stats_mask, uint32_t avail_offset, bool uses_gds)
970 {
971    struct radv_device *device = cmd_buffer->device;
972    struct radv_meta_saved_state saved_state;
973    struct radv_buffer src_buffer, dst_buffer;
974 
975    if (!*pipeline) {
976       VkResult ret = radv_device_init_meta_query_state_internal(device);
977       if (ret != VK_SUCCESS) {
978          cmd_buffer->record_result = ret;
979          return;
980       }
981    }
982 
983    /* VK_EXT_conditional_rendering says that copy commands should not be
984     * affected by conditional rendering.
985     */
986    radv_meta_save(&saved_state, cmd_buffer,
987                   RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS |
988                      RADV_META_SAVE_DESCRIPTORS | RADV_META_SUSPEND_PREDICATING);
989 
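   /* The source buffer must span both the per-query data (src_stride * count) and, for pipeline
    * statistics queries, the availability dwords located at avail_offset.
    */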
990    uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset);
991    uint64_t dst_buffer_size = dst_stride * (count - 1) + dst_size;
992 
993    radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset);
994    radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset);
995 
996    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
997                         *pipeline);
998 
999    radv_meta_push_descriptor_set(
1000       cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.query.p_layout, 0, /* set */
1001       2, /* descriptorWriteCount */
1002       (VkWriteDescriptorSet[]){
1003          {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1004           .dstBinding = 0,
1005           .dstArrayElement = 0,
1006           .descriptorCount = 1,
1007           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1008           .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&dst_buffer),
1009                                                    .offset = 0,
1010                                                    .range = VK_WHOLE_SIZE}},
1011          {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1012           .dstBinding = 1,
1013           .dstArrayElement = 0,
1014           .descriptorCount = 1,
1015           .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1016           .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&src_buffer),
1017                                                    .offset = 0,
1018                                                    .range = VK_WHOLE_SIZE}}});
1019 
1020    /* Encode the number of elements for easy access by the shader. */
1021    pipeline_stats_mask &= 0x7ff;
1022    pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
1023 
1024    avail_offset -= src_offset;
1025 
1026    struct {
1027       uint32_t flags;
1028       uint32_t dst_stride;
1029       uint32_t pipeline_stats_mask;
1030       uint32_t avail_offset;
1031       uint32_t uses_gds;
1032    } push_constants = {flags, dst_stride, pipeline_stats_mask, avail_offset, uses_gds};
1033 
1034    radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.query.p_layout,
1035                          VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
1036 
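   /* Invalidate L2/vector caches so the copy shader sees up-to-date query data; when
    * VK_QUERY_RESULT_WAIT_BIT is set, also flush the framebuffer caches so that pending
    * DB/CB counters reach memory first.
    */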
1037    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1038 
1039    if (flags & VK_QUERY_RESULT_WAIT_BIT)
1040       cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1041 
1042    radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1043 
1044    /* Ensure that the query copy dispatch is complete before a potential vkCmdResetPool because
1045     * there is an implicit execution dependency from each such query command to all query commands
1046     * previously submitted to the same queue.
1047     */
1048    cmd_buffer->active_query_flush_bits |=
1049       RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE;
1050 
1051    radv_buffer_finish(&src_buffer);
1052    radv_buffer_finish(&dst_buffer);
1053 
1054    radv_meta_restore(&saved_state, cmd_buffer);
1055 }
1056 
1057 static void
1058 radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
1059                         struct radv_query_pool *pool)
1060 {
1061    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
1062       radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
1063 
1064    if (pool->bo)
1065       device->ws->buffer_destroy(device->ws, pool->bo);
1066    vk_object_base_finish(&pool->base);
1067    vk_free2(&device->vk.alloc, pAllocator, pool);
1068 }
1069 
1070 VKAPI_ATTR VkResult VKAPI_CALL
1071 radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
1072                      const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
1073 {
1074    RADV_FROM_HANDLE(radv_device, device, _device);
1075    VkResult result;
1076    size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
1077                                 ? sizeof(struct radv_pc_query_pool)
1078                                 : sizeof(struct radv_query_pool);
1079 
1080    struct radv_query_pool *pool = vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8,
1081                                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1082 
1083    if (!pool)
1084       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1085 
1086    vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL);
1087 
1088    pool->type = pCreateInfo->queryType;
1089    pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
1090 
1091    /* The number of primitives generated by geometry shader invocations is only counted by the
1092     * hardware if GS uses the legacy path. When NGG GS is used, the hardware can't know the number
1093     * of generated primitives and we have to increment it from the shader using a plain GDS atomic.
1094     */
1095    pool->uses_gds = device->physical_device->use_ngg &&
1096                     ((pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) ||
1097                      pCreateInfo->queryType == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1098 
1099    switch (pCreateInfo->queryType) {
1100    case VK_QUERY_TYPE_OCCLUSION:
1101       pool->stride = 16 * device->physical_device->rad_info.max_render_backends;
1102       break;
1103    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1104       pool->stride = pipelinestat_block_size * 2;
1105       if (pool->uses_gds) {
1106          /* When the query pool needs GDS (for counting the number of primitives generated by a
1107           * geometry shader with NGG), allocate 2x64-bit values for begin/end.
1108           */
1109          pool->stride += 8 * 2;
1110       }
1111       break;
1112    case VK_QUERY_TYPE_TIMESTAMP:
1113    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1114    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1115    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1116    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1117       pool->stride = 8;
1118       break;
1119    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1120    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1121       pool->stride = 32;
1122       if (pool->uses_gds) {
1123          /* When the query pool needs GDS, allocate 4x64-bit values for begin/end of NGG GS and
1124           * NGG VS/TES because they use a different offset.
1125           */
1126          pool->stride += 8 * 4;
1127       }
1128       break;
1129    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1130       result = radv_pc_init_query_pool(device->physical_device, pCreateInfo,
1131                                        (struct radv_pc_query_pool *)pool);
1132 
1133       if (result != VK_SUCCESS) {
1134          radv_destroy_query_pool(device, pAllocator, pool);
1135          return vk_error(device, result);
1136       }
1137       break;
1138    }
1139    default:
1140       unreachable("creating unhandled query type");
1141    }
1142 
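   /* Per-query slots are laid out first; pipeline statistics pools additionally store one 32-bit
    * availability value per query after all slots.
    */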
1143    pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1144    pool->size = pool->availability_offset;
1145    if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
1146       pool->size += 4 * pCreateInfo->queryCount;
1147 
1148    result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT,
1149                                       RADEON_FLAG_NO_INTERPROCESS_SHARING,
1150                                       RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo);
1151    if (result != VK_SUCCESS) {
1152       radv_destroy_query_pool(device, pAllocator, pool);
1153       return vk_error(device, result);
1154    }
1155 
1156    pool->ptr = device->ws->buffer_map(pool->bo);
1157    if (!pool->ptr) {
1158       radv_destroy_query_pool(device, pAllocator, pool);
1159       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1160    }
1161 
1162    *pQueryPool = radv_query_pool_to_handle(pool);
1163    return VK_SUCCESS;
1164 }
1165 
1166 VKAPI_ATTR void VKAPI_CALL
1167 radv_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator)
1168 {
1169    RADV_FROM_HANDLE(radv_device, device, _device);
1170    RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
1171 
1172    if (!pool)
1173       return;
1174 
1175    radv_destroy_query_pool(device, pAllocator, pool);
1176 }
1177 
1178 VKAPI_ATTR VkResult VKAPI_CALL
1179 radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery,
1180                          uint32_t queryCount, size_t dataSize, void *pData, VkDeviceSize stride,
1181                          VkQueryResultFlags flags)
1182 {
1183    RADV_FROM_HANDLE(radv_device, device, _device);
1184    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1185    char *data = pData;
1186    VkResult result = VK_SUCCESS;
1187 
1188    if (vk_device_is_lost(&device->vk))
1189       return VK_ERROR_DEVICE_LOST;
1190 
1191    for (unsigned query_idx = 0; query_idx < queryCount; ++query_idx, data += stride) {
1192       char *dest = data;
1193       unsigned query = firstQuery + query_idx;
1194       char *src = pool->ptr + query * pool->stride;
1195       uint32_t available;
1196 
1197       switch (pool->type) {
1198       case VK_QUERY_TYPE_TIMESTAMP:
1199       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1200       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1201       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1202       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: {
1203          uint64_t const *src64 = (uint64_t const *)src;
1204          uint64_t value;
1205 
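         /* The slot holds TIMESTAMP_NOT_READY (UINT64_MAX) until the GPU writes the timestamp,
          * so keep polling while VK_QUERY_RESULT_WAIT_BIT is set.
          */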
1206          do {
1207             value = p_atomic_read(src64);
1208          } while (value == TIMESTAMP_NOT_READY && (flags & VK_QUERY_RESULT_WAIT_BIT));
1209 
1210          available = value != TIMESTAMP_NOT_READY;
1211 
1212          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1213             result = VK_NOT_READY;
1214 
1215          if (flags & VK_QUERY_RESULT_64_BIT) {
1216             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1217                *(uint64_t *)dest = value;
1218             dest += 8;
1219          } else {
1220             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1221                *(uint32_t *)dest = (uint32_t)value;
1222             dest += 4;
1223          }
1224          break;
1225       }
1226       case VK_QUERY_TYPE_OCCLUSION: {
1227          uint64_t const *src64 = (uint64_t const *)src;
1228          uint32_t db_count = device->physical_device->rad_info.max_render_backends;
1229          uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1230          uint64_t sample_count = 0;
1231          available = 1;
1232 
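         /* Accumulate the begin/end sample pairs of every enabled render backend; bit 63 of each
          * value indicates whether that counter has been written yet.
          */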
1233          for (int i = 0; i < db_count; ++i) {
1234             uint64_t start, end;
1235 
1236             if (!(enabled_rb_mask & (1 << i)))
1237                continue;
1238 
1239             do {
1240                start = p_atomic_read(src64 + 2 * i);
1241                end = p_atomic_read(src64 + 2 * i + 1);
1242             } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) &&
1243                      (flags & VK_QUERY_RESULT_WAIT_BIT));
1244 
1245             if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1246                available = 0;
1247             else {
1248                sample_count += end - start;
1249             }
1250          }
1251 
1252          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1253             result = VK_NOT_READY;
1254 
1255          if (flags & VK_QUERY_RESULT_64_BIT) {
1256             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1257                *(uint64_t *)dest = sample_count;
1258             dest += 8;
1259          } else {
1260             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1261                *(uint32_t *)dest = sample_count;
1262             dest += 4;
1263          }
1264          break;
1265       }
1266       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1267          const uint32_t *avail_ptr =
1268             (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query);
1269          uint64_t ngg_gds_result = 0;
1270 
1271          do {
1272             available = p_atomic_read(avail_ptr);
1273          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT));
1274 
1275          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1276             result = VK_NOT_READY;
1277 
1278          if (pool->uses_gds) {
1279             /* Compute the result that was copied from GDS. */
1280             const uint64_t *gds_start = (uint64_t *)(src + pipelinestat_block_size * 2);
1281             const uint64_t *gds_stop = (uint64_t *)(src + pipelinestat_block_size * 2 + 8);
1282 
1283             ngg_gds_result = gds_stop[0] - gds_start[0];
1284          }
1285 
1286          const uint64_t *start = (uint64_t *)src;
1287          const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
1288          if (flags & VK_QUERY_RESULT_64_BIT) {
1289             uint64_t *dst = (uint64_t *)dest;
1290             dest += util_bitcount(pool->pipeline_stats_mask) * 8;
1291             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1292                if (pool->pipeline_stats_mask & (1u << i)) {
1293                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1294                      *dst = stop[pipeline_statistics_indices[i]] -
1295                             start[pipeline_statistics_indices[i]];
1296 
1297                      if (pool->uses_gds &&
1298                          (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
1299                         *dst += ngg_gds_result;
1300                      }
1301                   }
1302                   dst++;
1303                }
1304             }
1305 
1306          } else {
1307             uint32_t *dst = (uint32_t *)dest;
1308             dest += util_bitcount(pool->pipeline_stats_mask) * 4;
1309             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
1310                if (pool->pipeline_stats_mask & (1u << i)) {
1311                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
1312                      *dst = stop[pipeline_statistics_indices[i]] -
1313                             start[pipeline_statistics_indices[i]];
1314 
1315                      if (pool->uses_gds &&
1316                          (1u << i) == VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
1317                         *dst += ngg_gds_result;
1318                      }
1319                   }
1320                   dst++;
1321                }
1322             }
1323          }
1324          break;
1325       }
1326       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1327          uint64_t const *src64 = (uint64_t const *)src;
1328          uint64_t num_primitives_written;
1329          uint64_t primitive_storage_needed;
1330 
1331          /* SAMPLE_STREAMOUTSTATS stores this structure:
1332           * {
1333           *	u64 NumPrimitivesWritten;
1334           *	u64 PrimitiveStorageNeeded;
1335           * }
1336           */
1337          available = 1;
1338          for (int j = 0; j < 4; j++) {
1339             if (!(p_atomic_read(src64 + j) & 0x8000000000000000UL))
1340                available = 0;
1341          }
1342 
1343          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1344             result = VK_NOT_READY;
1345 
1346          num_primitives_written = src64[3] - src64[1];
1347          primitive_storage_needed = src64[2] - src64[0];
1348 
1349          if (flags & VK_QUERY_RESULT_64_BIT) {
1350             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1351                *(uint64_t *)dest = num_primitives_written;
1352             dest += 8;
1353             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1354                *(uint64_t *)dest = primitive_storage_needed;
1355             dest += 8;
1356          } else {
1357             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1358                *(uint32_t *)dest = num_primitives_written;
1359             dest += 4;
1360             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1361                *(uint32_t *)dest = primitive_storage_needed;
1362             dest += 4;
1363          }
1364          break;
1365       }
1366       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1367          uint64_t const *src64 = (uint64_t const *)src;
1368          uint64_t primitive_storage_needed;
1369 
1370          /* SAMPLE_STREAMOUTSTATS stores this structure:
1371           * {
1372           *	u64 NumPrimitivesWritten;
1373           *	u64 PrimitiveStorageNeeded;
1374           * }
1375           */
1376          available = 1;
1377          if (!(p_atomic_read(src64 + 0) & 0x8000000000000000UL) ||
1378              !(p_atomic_read(src64 + 2) & 0x8000000000000000UL)) {
1379             available = 0;
1380          }
1381 
1382          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1383             result = VK_NOT_READY;
1384 
1385          primitive_storage_needed = src64[2] - src64[0];
1386 
1387          if (pool->uses_gds) {
1388             /* Accumulate the result that was copied from GDS in case NGG GS or NGG VS/TES have been
1389              * used.
1390              */
1391             primitive_storage_needed += src64[5] - src64[4]; /* NGG GS */
1392             primitive_storage_needed += src64[7] - src64[6]; /* NGG VS/TES */
1393          }
1394 
1395          if (flags & VK_QUERY_RESULT_64_BIT) {
1396             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1397                *(uint64_t *)dest = primitive_storage_needed;
1398             dest += 8;
1399          } else {
1400             if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1401                *(uint32_t *)dest = primitive_storage_needed;
1402             dest += 4;
1403          }
1404          break;
1405       }
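      /* Performance queries store one availability marker per pass at the end of
       * the query slot; with WAIT_BIT, wait for all of them before letting
       * radv_pc_get_results() decode the counter values.
       */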
1406       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1407          struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
1408          const uint64_t *src64 = (const uint64_t *)src;
1409          bool avail;
1410          do {
1411             avail = true;
1412             for (unsigned i = 0; i < pc_pool->num_passes; ++i)
1413                if (!p_atomic_read(src64 + pool->stride / 8 - i - 1))
1414                   avail = false;
1415          } while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT));
1416 
1417          available = avail;
1418 
1419          radv_pc_get_results(pc_pool, src64, dest);
1420          dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
1421          break;
1422       }
1423       default:
1424          unreachable("trying to get results of unhandled query type");
1425       }
1426 
1427       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1428          if (flags & VK_QUERY_RESULT_64_BIT) {
1429             *(uint64_t *)dest = available;
1430          } else {
1431             *(uint32_t *)dest = available;
1432          }
1433       }
1434    }
1435 
1436    return result;
1437 }
1438 
1439 static void
1440 emit_query_flush(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool)
1441 {
1442    if (cmd_buffer->pending_reset_query) {
1443       if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1444          /* Caches only need to be flushed if the query pool size is
1445           * large enough to be reset using the compute shader path.
1446           * Small pools don't need any cache flushes because they are
1447           * cleared with a CP DMA clear.
1448           */
1449          si_emit_cache_flush(cmd_buffer);
1450       }
1451    }
1452 }
1453 
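/* Size in bytes of a single query result in the destination buffer, including
 * the optional availability value.
 */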
1454 static size_t
1455 radv_query_result_size(const struct radv_query_pool *pool, VkQueryResultFlags flags)
1456 {
1457    unsigned values = (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0;
1458    switch (pool->type) {
1459    case VK_QUERY_TYPE_TIMESTAMP:
1460    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1461    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1462    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1463    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1464    case VK_QUERY_TYPE_OCCLUSION:
1465       values += 1;
1466       break;
1467    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1468       values += util_bitcount(pool->pipeline_stats_mask);
1469       break;
1470    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1471       values += 2;
1472       break;
1473    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1474       values += 1;
1475       break;
1476    default:
1477       unreachable("trying to get size of unhandled query type");
1478    }
1479    return values * ((flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4);
1480 }
1481 
1482 VKAPI_ATTR void VKAPI_CALL
1483 radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
1484                              uint32_t firstQuery, uint32_t queryCount, VkBuffer dstBuffer,
1485                              VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags)
1486 {
1487    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1488    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1489    RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
1490    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1491    uint64_t va = radv_buffer_get_va(pool->bo);
1492    uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
1493    size_t dst_size = radv_query_result_size(pool, flags);
1494    dest_va += dst_buffer->offset + dstOffset;
1495 
1496    if (!queryCount)
1497       return;
1498 
1499    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
1500    radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
1501 
1502    /* Work around engines that forget to properly specify WAIT_BIT because some drivers
1503     * implicitly synchronize before the query copy.
1504     */
1505    if (cmd_buffer->device->instance->flush_before_query_copy)
1506       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1507 
1508    /* From the Vulkan spec 1.1.108:
1509     *
1510     * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1511     *  previous uses of vkCmdResetQueryPool in the same queue, without any
1512     *  additional synchronization."
1513     *
1514     * So, we have to flush the caches if the compute shader path was used.
1515     */
1516    emit_query_flush(cmd_buffer, pool);
1517 
1518    switch (pool->type) {
1519    case VK_QUERY_TYPE_OCCLUSION:
1520       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1521          unsigned enabled_rb_mask = cmd_buffer->device->physical_device->rad_info.enabled_rb_mask;
1522          uint32_t rb_avail_offset = 16 * util_last_bit(enabled_rb_mask) - 4;
1523          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1524             unsigned query = firstQuery + i;
1525             uint64_t src_va = va + query * pool->stride + rb_avail_offset;
1526 
1527             radeon_check_space(cmd_buffer->device->ws, cs, 7);
1528 
1529             /* Waits on the upper word of the last DB entry */
1530             radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va, 0x80000000, 0xffffffff);
1531          }
1532       }
1533       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
1534                         pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1535                         dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1536                         flags, 0, 0, false);
1537       break;
1538    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1539       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1540          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1541             unsigned query = firstQuery + i;
1542 
1543             radeon_check_space(cmd_buffer->device->ws, cs, 7);
1544 
1545             uint64_t avail_va = va + pool->availability_offset + 4 * query;
1546 
1547             /* This waits on the ME. All copies below are done on the ME */
1548             radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
1549          }
1550       }
1551       radv_query_shader(
1552          cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
1553          pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset,
1554          pool->stride, stride, dst_size, queryCount, flags, pool->pipeline_stats_mask,
1555          pool->availability_offset + 4 * firstQuery, pool->uses_gds);
1556       break;
1557    case VK_QUERY_TYPE_TIMESTAMP:
1558    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1559    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1560    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1561    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1562       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1563          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1564             unsigned query = firstQuery + i;
1565             uint64_t local_src_va = va + query * pool->stride;
1566 
1567             radeon_check_space(cmd_buffer->device->ws, cs, 7);
1568 
1569             /* Wait on the high 32 bits of the timestamp in
1570              * case the low part is 0xffffffff.
1571              */
1572             radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL, local_src_va + 4,
1573                              TIMESTAMP_NOT_READY >> 32, 0xffffffff);
1574          }
1575       }
1576 
1577       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline,
1578                         pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1579                         dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1580                         flags, 0, 0, false);
1581       break;
1582    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1583       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1584          for (unsigned i = 0; i < queryCount; i++) {
1585             unsigned query = firstQuery + i;
1586             uint64_t src_va = va + query * pool->stride;
1587 
1588             radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1589 
1590             /* Wait on the upper word of all results. */
1591             for (unsigned j = 0; j < 4; j++, src_va += 8) {
1592                radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000,
1593                                 0xffffffff);
1594             }
1595          }
1596       }
1597 
1598       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline,
1599                         pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1600                         dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1601                         flags, 0, 0, false);
1602       break;
1603    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1604       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1605          for (unsigned i = 0; i < queryCount; i++) {
1606             unsigned query = firstQuery + i;
1607             uint64_t src_va = va + query * pool->stride;
1608 
1609             radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
1610 
1611             /* Wait on the upper word of the PrimitiveStorageNeeded result. */
1612             radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
1613             radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
1614          }
1615       }
1616 
1617       radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pg_query_pipeline,
1618                         pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1619                         dst_buffer->offset + dstOffset, pool->stride, stride, dst_size, queryCount,
1620                         flags, 0, 0, pool->uses_gds);
1621       break;
1622    default:
1623       unreachable("trying to get results of unhandled query type");
1624    }
1625 }
1626 
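/* Timestamp-like queries are cleared to TIMESTAMP_NOT_READY so that an
 * unwritten slot can be distinguished from a valid result; all other query
 * types are cleared to zero.
 */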
1627 static uint32_t
1628 query_clear_value(VkQueryType type)
1629 {
1630    switch (type) {
1631    case VK_QUERY_TYPE_TIMESTAMP:
1632    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1633    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1634    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1635    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1636       return (uint32_t)TIMESTAMP_NOT_READY;
1637    default:
1638       return 0;
1639    }
1640 }
1641 
1642 VKAPI_ATTR void VKAPI_CALL
1643 radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
1644                        uint32_t queryCount)
1645 {
1646    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1647    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1648    uint32_t value = query_clear_value(pool->type);
1649    uint32_t flush_bits = 0;
1650 
1651    /* Make sure to sync all previous work if the given command buffer has
1652     * pending active queries. Otherwise the GPU might write query data
1653     * after the reset operation.
1654     */
1655    cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1656 
1657    flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1658                                   radv_buffer_get_va(pool->bo) + firstQuery * pool->stride,
1659                                   queryCount * pool->stride, value);
1660 
1661    if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1662       flush_bits |=
1663          radv_fill_buffer(cmd_buffer, NULL, pool->bo,
1664                           radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4,
1665                           queryCount * 4, 0);
1666    }
1667 
1668    if (flush_bits) {
1669       /* Only need to flush caches for the compute shader path. */
1670       cmd_buffer->pending_reset_query = true;
1671       cmd_buffer->state.flush_bits |= flush_bits;
1672    }
1673 }
1674 
1675 VKAPI_ATTR void VKAPI_CALL
1676 radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery,
1677                     uint32_t queryCount)
1678 {
1679    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1680 
1681    uint32_t value = query_clear_value(pool->type);
1682    uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride);
1683    uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1684 
1685    for (uint32_t *p = data; p != data_end; ++p)
1686       *p = value;
1687 
1688    if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1689       memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4);
1690    }
1691 }
1692 
1693 static unsigned
1694 event_type_for_stream(unsigned stream)
1695 {
1696    switch (stream) {
1697    default:
1698    case 0:
1699       return V_028A90_SAMPLE_STREAMOUTSTATS;
1700    case 1:
1701       return V_028A90_SAMPLE_STREAMOUTSTATS1;
1702    case 2:
1703       return V_028A90_SAMPLE_STREAMOUTSTATS2;
1704    case 3:
1705       return V_028A90_SAMPLE_STREAMOUTSTATS3;
1706    }
1707 }
1708 
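/* Sample the streamout statistics (NumPrimitivesWritten/PrimitiveStorageNeeded)
 * for the given stream into memory at 'va'.
 */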
1709 static void
1710 emit_sample_streamout(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uint32_t index)
1711 {
1712    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1713 
1714    radeon_check_space(cmd_buffer->device->ws, cs, 4);
1715 
1716    assert(index < MAX_SO_STREAMS);
1717 
1718    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1719    radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
1720    radeon_emit(cs, va);
1721    radeon_emit(cs, va >> 32);
1722 }
1723 
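/* Copy a GDS counter (used for the NGG query paths) to memory at 'va' after
 * making sure previous GDS writes have landed.
 */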
1724 static void
1725 gfx10_copy_gds_query(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
1726 {
1727    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1728 
1729    /* Make sure GDS is idle before copying the value. */
1730    cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
1731    si_emit_cache_flush(cmd_buffer);
1732 
1733    radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1734    radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
1735                    COPY_DATA_WR_CONFIRM);
1736    radeon_emit(cs, gds_offset);
1737    radeon_emit(cs, 0);
1738    radeon_emit(cs, va);
1739    radeon_emit(cs, va >> 32);
1740 }
1741 
1742 static void
1743 emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
1744                  VkQueryType query_type, VkQueryControlFlags flags, uint32_t index)
1745 {
1746    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1747    switch (query_type) {
1748    case VK_QUERY_TYPE_OCCLUSION:
1749       radeon_check_space(cmd_buffer->device->ws, cs, 7);
1750 
1751       ++cmd_buffer->state.active_occlusion_queries;
1752       if (cmd_buffer->state.active_occlusion_queries == 1) {
1753          if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
1754             /* This is the first occlusion query; enable
1755              * the hint if the precision bit is set.
1756              */
1757             cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1758          }
1759 
1760          radv_set_db_count_control(cmd_buffer, true);
1761       } else {
1762          if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
1763              !cmd_buffer->state.perfect_occlusion_queries_enabled) {
1764             /* This is not the first query, but this one
1765              * needs precise results, so DB_COUNT_CONTROL
1766              * has to be updated accordingly.
1767              */
1768             cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1769 
1770             radv_set_db_count_control(cmd_buffer, true);
1771          }
1772       }
1773 
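      /* On GFX11, program PIXEL_PIPE_STAT_CONTROL with the enabled render
       * backend mask before sampling the occlusion counters.
       */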
1774       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1775          uint64_t rb_mask =
1776             BITFIELD64_MASK(cmd_buffer->device->physical_device->rad_info.max_render_backends);
1777 
1778          radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1779          radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));
1780          radeon_emit(cs, PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) |
1781                          PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
1782                          PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
1783          radeon_emit(cs, PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
1784       }
1785 
1786       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1787 
1788       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1789          radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
1790       } else {
1791          radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1792       }
1793 
1794       radeon_emit(cs, va);
1795       radeon_emit(cs, va >> 32);
1796       break;
1797    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1798       radeon_check_space(cmd_buffer->device->ws, cs, 4);
1799 
1800       ++cmd_buffer->state.active_pipeline_queries;
1801       if (cmd_buffer->state.active_pipeline_queries == 1) {
1802          cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1803          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
1804       }
1805 
1806       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1807       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1808       radeon_emit(cs, va);
1809       radeon_emit(cs, va >> 32);
1810 
1811       if (pool->uses_gds) {
1812          va += pipelinestat_block_size * 2;
1813 
1814          gfx10_copy_gds_query(cmd_buffer, 0, va); /* NGG GS */
1815 
1816          /* Record that the command buffer needs GDS. */
1817          cmd_buffer->gds_needed = true;
1818 
1819          cmd_buffer->state.active_pipeline_gds_queries++;
1820       }
1821       break;
1822    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1823       emit_sample_streamout(cmd_buffer, va, index);
1824       break;
1825    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1826       if (!cmd_buffer->state.prims_gen_query_enabled) {
1827          bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1828 
1829          cmd_buffer->state.prims_gen_query_enabled = true;
1830 
1831          if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1832             radv_emit_streamout_enable(cmd_buffer);
1833          }
1834       }
1835 
1836       emit_sample_streamout(cmd_buffer, va, index);
1837 
1838       if (pool->uses_gds) {
1839          gfx10_copy_gds_query(cmd_buffer, 0, va + 32); /* NGG GS */
1840          gfx10_copy_gds_query(cmd_buffer, 4, va + 48); /* NGG VS/TES */
1841 
1842          /* Record that the command buffer needs GDS. */
1843          cmd_buffer->gds_needed = true;
1844 
1845          cmd_buffer->state.active_pipeline_gds_queries++;
1846       }
1847       break;
1848    }
1849    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1850       radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
1851       break;
1852    }
1853    default:
1854       unreachable("beginning unhandled query type");
1855    }
1856 }
1857 
1858 static void
1859 emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, uint64_t va,
1860                uint64_t avail_va, VkQueryType query_type, uint32_t index)
1861 {
1862    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1863    switch (query_type) {
1864    case VK_QUERY_TYPE_OCCLUSION:
1865       radeon_check_space(cmd_buffer->device->ws, cs, 14);
1866 
1867       cmd_buffer->state.active_occlusion_queries--;
1868       if (cmd_buffer->state.active_occlusion_queries == 0) {
1869          radv_set_db_count_control(cmd_buffer, false);
1870 
1871          /* Reset the perfect occlusion queries hint now that no
1872           * queries are active.
1873           */
1874          cmd_buffer->state.perfect_occlusion_queries_enabled = false;
1875       }
1876 
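      /* Sample the end-of-query occlusion counters at va + 8; each render
       * backend slot holds the begin value followed by the end value.
       */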
1877       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1878       if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) {
1879          radeon_emit(cs, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
1880       } else {
1881          radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1882       }
1883       radeon_emit(cs, va + 8);
1884       radeon_emit(cs, (va + 8) >> 32);
1885 
1886       break;
1887    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1888       radeon_check_space(cmd_buffer->device->ws, cs, 16);
1889 
1890       cmd_buffer->state.active_pipeline_queries--;
1891       if (cmd_buffer->state.active_pipeline_queries == 0) {
1892          cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
1893          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1894       }
1895       va += pipelinestat_block_size;
1896 
1897       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1898       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1899       radeon_emit(cs, va);
1900       radeon_emit(cs, va >> 32);
1901 
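      /* Mark the query available by writing 1 to avail_va with a bottom-of-pipe
       * EOP event, after the end samples have been requested.
       */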
1902       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
1903                                  radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
1904                                  0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1,
1905                                  cmd_buffer->gfx9_eop_bug_va);
1906 
1907       if (pool->uses_gds) {
1908          va += pipelinestat_block_size + 8;
1909 
1910          gfx10_copy_gds_query(cmd_buffer, 0, va); /* NGG GS */
1911 
1912          cmd_buffer->state.active_pipeline_gds_queries--;
1913       }
1914       break;
1915    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1916       emit_sample_streamout(cmd_buffer, va + 16, index);
1917       break;
1918    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
1919       if (cmd_buffer->state.prims_gen_query_enabled) {
1920          bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
1921 
1922          cmd_buffer->state.prims_gen_query_enabled = false;
1923 
1924          if (old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) {
1925             radv_emit_streamout_enable(cmd_buffer);
1926          }
1927       }
1928 
1929       emit_sample_streamout(cmd_buffer, va + 16, index);
1930 
1931       if (pool->uses_gds) {
1932          gfx10_copy_gds_query(cmd_buffer, 0, va + 40); /* NGG GS */
1933          gfx10_copy_gds_query(cmd_buffer, 4, va + 56); /* NGG VS/TES */
1934 
1935          cmd_buffer->state.active_pipeline_gds_queries--;
1936       }
1937       break;
1938    }
1939    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1940       radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
1941       break;
1942    }
1943    default:
1944       unreachable("ending unhandled query type");
1945    }
1946 
1947    cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1948                                           RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 |
1949                                           RADV_CMD_FLAG_INV_VCACHE;
1950    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
1951       cmd_buffer->active_query_flush_bits |=
1952          RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
1953    }
1954 }
1955 
1956 VKAPI_ATTR void VKAPI_CALL
1957 radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
1958                              VkQueryControlFlags flags, uint32_t index)
1959 {
1960    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1961    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1962    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1963    uint64_t va = radv_buffer_get_va(pool->bo);
1964 
1965    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1966 
1967    emit_query_flush(cmd_buffer, pool);
1968 
1969    va += pool->stride * query;
1970 
1971    emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
1972 }
1973 
1974 VKAPI_ATTR void VKAPI_CALL
1975 radv_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
1976                    VkQueryControlFlags flags)
1977 {
1978    radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
1979 }
1980 
1981 VKAPI_ATTR void VKAPI_CALL
1982 radv_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query,
1983                            uint32_t index)
1984 {
1985    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1986    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1987    uint64_t va = radv_buffer_get_va(pool->bo);
1988    uint64_t avail_va = va + pool->availability_offset + 4 * query;
1989    va += pool->stride * query;
1990 
1991    /* There is no need to add the pool BO to the list because the query must
1992     * currently be active, which means the BO is already in the list.
1993     */
1994    emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
1995 
1996    /*
1997     * For multiview we have to emit a query for each bit in the mask;
1998     * however, the first query we emit will get the totals for all the
1999     * operations, so we don't want to get a real value in the other
2000     * queries. This emits a fake begin/end sequence so the waiting
2001     * code gets a completed query value and doesn't hang, but the
2002     * query returns 0.
2003     */
2004    if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
2005       for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
2006          va += pool->stride;
2007          avail_va += 4;
2008          emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
2009          emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
2010       }
2011    }
2012 }
2013 
2014 VKAPI_ATTR void VKAPI_CALL
2015 radv_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query)
2016 {
2017    radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
2018 }
2019 
2020 VKAPI_ATTR void VKAPI_CALL
2021 radv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage,
2022                         VkQueryPool queryPool, uint32_t query)
2023 {
2024    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2025    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2026    bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
2027    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2028    uint64_t va = radv_buffer_get_va(pool->bo);
2029    uint64_t query_va = va + pool->stride * query;
2030 
2031    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2032 
2033    emit_query_flush(cmd_buffer, pool);
2034 
2035    int num_queries = 1;
2036    if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
2037       num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);
2038 
2039    ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
2040 
2041    for (unsigned i = 0; i < num_queries; i++) {
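      /* For TOP_OF_PIPE, copy the current GPU timestamp immediately with
       * COPY_DATA; for all other stages, write it with a bottom-of-pipe EOP
       * event so it is taken after prior work completes.
       */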
2042       if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
2043          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2044          radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
2045                             COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) | COPY_DATA_DST_SEL(V_370_MEM));
2046          radeon_emit(cs, 0);
2047          radeon_emit(cs, 0);
2048          radeon_emit(cs, query_va);
2049          radeon_emit(cs, query_va >> 32);
2050       } else {
2051          si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
2052                                     mec, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
2053                                     EOP_DATA_SEL_TIMESTAMP, query_va, 0,
2054                                     cmd_buffer->gfx9_eop_bug_va);
2055       }
2056       query_va += pool->stride;
2057    }
2058 
2059    cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
2060                                           RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 |
2061                                           RADV_CMD_FLAG_INV_VCACHE;
2062    if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX9) {
2063       cmd_buffer->active_query_flush_bits |=
2064          RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2065    }
2066 
2067    assert(cmd_buffer->cs->cdw <= cdw_max);
2068 }
2069 
2070 VKAPI_ATTR void VKAPI_CALL
2071 radv_CmdWriteAccelerationStructuresPropertiesKHR(
2072    VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount,
2073    const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType,
2074    VkQueryPool queryPool, uint32_t firstQuery)
2075 {
2076    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2077    RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2078    struct radeon_cmdbuf *cs = cmd_buffer->cs;
2079    uint64_t pool_va = radv_buffer_get_va(pool->bo);
2080    uint64_t query_va = pool_va + pool->stride * firstQuery;
2081 
2082    radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2083 
2084    emit_query_flush(cmd_buffer, pool);
2085 
2086    ASSERTED unsigned cdw_max =
2087       radeon_check_space(cmd_buffer->device->ws, cs, 6 * accelerationStructureCount);
2088 
2089    for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
2090       RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, pAccelerationStructures[i]);
2091       uint64_t va = radv_accel_struct_get_va(accel_struct);
2092 
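      /* Each result is a 64-bit value copied directly from the corresponding
       * field of the acceleration structure header into the query slot.
       */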
2093       switch (queryType) {
2094       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
2095          va += offsetof(struct radv_accel_struct_header, compacted_size);
2096          break;
2097       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
2098          va += offsetof(struct radv_accel_struct_header, serialization_size);
2099          break;
2100       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
2101          va += offsetof(struct radv_accel_struct_header, instance_count);
2102          break;
2103       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
2104          va += offsetof(struct radv_accel_struct_header, size);
2105          break;
2106       default:
2107          unreachable("Unhandle accel struct query type.");
2108       }
2109 
2110       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2111       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
2112                          COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
2113       radeon_emit(cs, va);
2114       radeon_emit(cs, va >> 32);
2115       radeon_emit(cs, query_va);
2116       radeon_emit(cs, query_va >> 32);
2117 
2118       query_va += pool->stride;
2119    }
2120 
2121    assert(cmd_buffer->cs->cdw <= cdw_max);
2122 }
2123