1 /*
2 * Copyright 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <fcntl.h>
31
32 #include "nir/nir_builder.h"
33 #include "radv_meta.h"
34 #include "radv_private.h"
35 #include "radv_cs.h"
36 #include "sid.h"
37 #include "util/u_atomic.h"
38
39 #define TIMESTAMP_NOT_READY UINT64_MAX
40
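/* A pipeline statistics query slot holds 11 64-bit counters sampled at begin,
 * followed by the same 11 counters sampled at end. pipeline_statistics_indices
 * maps each VkQueryPipelineStatisticFlagBits bit position to that counter's
 * slot within one of those blocks. */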
41 static const int pipelinestat_block_size = 11 * 8;
42 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
43
44 static unsigned
45 radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)
46 {
47 int offset = ffs(flag) - 1;
48 assert(offset < ARRAY_SIZE(pipeline_statistics_indices));
49 return pipeline_statistics_indices[offset];
50 }
51
52 static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
53 {
54 return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
55 }
56
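/* Emits "if (counter >= count) break;" for the innermost loop and then
 * increments the counter variable. */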
57 static void radv_break_on_count(nir_builder *b, nir_variable *var, nir_ssa_def *count)
58 {
59 nir_ssa_def *counter = nir_load_var(b, var);
60
61 nir_push_if(b, nir_uge(b, counter, count));
62 nir_jump(b, nir_jump_break);
63 nir_pop_if(b, NULL);
64
65 counter = nir_iadd(b, counter, nir_imm_int(b, 1));
66 nir_store_var(b, var, counter, 0x1);
67 }
68
69 static struct nir_ssa_def *
70 radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
71 {
72 nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
73 nir_intrinsic_set_base(flags, 0);
74 nir_intrinsic_set_range(flags, 16);
75 flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
76 flags->num_components = 1;
77 nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
78 nir_builder_instr_insert(b, &flags->instr);
79 return &flags->dest.ssa;
80 }
81
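/* Writes the 32-bit availability value to dst_buf at the given offset, but
 * only when VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is requested; with
 * VK_QUERY_RESULT_64_BIT the value is zero-extended and stored as 64 bits. */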
82 static void
83 radv_store_availability(nir_builder *b, nir_ssa_def *flags, nir_ssa_def *dst_buf,
84 nir_ssa_def *offset, nir_ssa_def *value32)
85 {
86 nir_push_if(b, nir_test_flag(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
87
88 nir_push_if(b, nir_test_flag(b, flags, VK_QUERY_RESULT_64_BIT));
89
90 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_ssbo);
91 store->src[0] = nir_src_for_ssa(nir_vec2(b, value32, nir_imm_int(b, 0)));
92 store->src[1] = nir_src_for_ssa(dst_buf);
93 store->src[2] = nir_src_for_ssa(offset);
94 nir_intrinsic_set_write_mask(store, 0x3);
95 nir_intrinsic_set_align(store, 8, 0);
96 store->num_components = 2;
97 nir_builder_instr_insert(b, &store->instr);
98
99 nir_push_else(b, NULL);
100
101 store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_ssbo);
102 store->src[0] = nir_src_for_ssa(value32);
103 store->src[1] = nir_src_for_ssa(dst_buf);
104 store->src[2] = nir_src_for_ssa(offset);
105 nir_intrinsic_set_write_mask(store, 0x1);
106 nir_intrinsic_set_align(store, 4, 0);
107 store->num_components = 1;
108 nir_builder_instr_insert(b, &store->instr);
109
110 nir_pop_if(b, NULL);
111
112 nir_pop_if(b, NULL);
113 }
114
115 static nir_shader *
116 build_occlusion_query_shader(struct radv_device *device) {
117 /* the shader this builds is roughly
118 *
119 * push constants {
120 * uint32_t flags;
121 * uint32_t dst_stride;
122 * };
123 *
124 * uint32_t src_stride = 16 * db_count;
125 *
126 * location(binding = 0) buffer dst_buf;
127 * location(binding = 1) buffer src_buf;
128 *
129 * void main() {
130 * uint64_t result = 0;
131 * uint64_t src_offset = src_stride * global_id.x;
132 * uint64_t dst_offset = dst_stride * global_id.x;
133 * bool available = true;
134 * for (int i = 0; i < db_count; ++i) {
135 * if (enabled_rb_mask & (1 << i)) {
136 * uint64_t start = src_buf[src_offset + 16 * i];
137 * uint64_t end = src_buf[src_offset + 16 * i + 8];
138 * if ((start & (1ull << 63)) && (end & (1ull << 63)))
139 * result += end - start;
140 * else
141 * available = false;
142 * }
143 * }
144 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
145 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
146 * if (flags & VK_QUERY_RESULT_64_BIT)
147 * dst_buf[dst_offset] = result;
148 * else
149 * dst_buf[dst_offset] = (uint32_t)result;
150 * }
151 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
152 * dst_buf[dst_offset + elem_size] = available;
153 * }
154 * }
155 */
156 nir_builder b;
157 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
158 b.shader->info.name = ralloc_strdup(b.shader, "occlusion_query");
159 b.shader->info.cs.local_size[0] = 64;
160 b.shader->info.cs.local_size[1] = 1;
161 b.shader->info.cs.local_size[2] = 1;
162
163 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
164 nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
165 nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
166 nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
167 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
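/* Every render backend writes its own begin/end ZPASS counter pair;
 * counters of disabled RBs are never written and must be skipped. */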
168 unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
169 unsigned db_count = device->physical_device->rad_info.num_render_backends;
170
171 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
172
173 nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
174 nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
175
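/* Compute global ID. */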
176 nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
177 nir_ssa_def *wg_id = nir_load_work_group_id(&b, 32);
178 nir_ssa_def *block_size = nir_imm_ivec4(&b,
179 b.shader->info.cs.local_size[0],
180 b.shader->info.cs.local_size[1],
181 b.shader->info.cs.local_size[2], 0);
182 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
183 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
184
185 nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
186 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
187 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
188 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
189
190
191 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
192 nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
193 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
194
195 nir_push_loop(&b);
196
197 nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
198 radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
199
200 nir_ssa_def *enabled_cond =
201 nir_iand(&b, nir_imm_int(&b, enabled_rb_mask),
202 nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count));
203
204 nir_push_if(&b, nir_i2b(&b, enabled_cond));
205
206 nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16));
207 load_offset = nir_iadd(&b, input_base, load_offset);
208
209 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
210 load->src[0] = nir_src_for_ssa(src_buf);
211 load->src[1] = nir_src_for_ssa(load_offset);
212 nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL);
213 load->num_components = 2;
214 nir_intrinsic_set_align(load, 16, 0);
215 nir_builder_instr_insert(&b, &load->instr);
216
217 nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1);
218 nir_store_var(&b, end, nir_channel(&b, &load->dest.ssa, 1), 0x1);
219
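/* The hardware sets bit 63 of each counter once it is valid, so testing
 * "value < 0" on the signed 64-bit loads checks availability. */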
220 nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
221 nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
222
223 nir_push_if(&b, nir_iand(&b, start_done, end_done));
224
225 nir_store_var(&b, result,
226 nir_iadd(&b, nir_load_var(&b, result),
227 nir_isub(&b, nir_load_var(&b, end),
228 nir_load_var(&b, start))), 0x1);
229
230 nir_push_else(&b, NULL);
231
232 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
233
234 nir_pop_if(&b, NULL);
235 nir_pop_if(&b, NULL);
236 nir_pop_loop(&b, NULL);
237
238 /* Store the result if complete or if partial results have been requested. */
239
240 nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
241 nir_ssa_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
242 nir_push_if(&b,
243 nir_ior(&b,
244 nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
245 nir_load_var(&b, available)));
246
247 nir_push_if(&b, result_is_64bit);
248
249 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
250 store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
251 store->src[1] = nir_src_for_ssa(dst_buf);
252 store->src[2] = nir_src_for_ssa(output_base);
253 nir_intrinsic_set_write_mask(store, 0x1);
254 nir_intrinsic_set_align(store, 8, 0);
255 store->num_components = 1;
256 nir_builder_instr_insert(&b, &store->instr);
257
258 nir_push_else(&b, NULL);
259
260 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
261 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
262 store->src[1] = nir_src_for_ssa(dst_buf);
263 store->src[2] = nir_src_for_ssa(output_base);
264 nir_intrinsic_set_write_mask(store, 0x1);
265 nir_intrinsic_set_align(store, 4, 0);
266 store->num_components = 1;
267 nir_builder_instr_insert(&b, &store->instr);
268
269 nir_pop_if(&b, NULL);
270 nir_pop_if(&b, NULL);
271
272 radv_store_availability(&b, flags, dst_buf,
273 nir_iadd(&b, result_size, output_base),
274 nir_b2i32(&b, nir_load_var(&b, available)));
275
276 return b.shader;
277 }
278
279 static nir_shader *
280 build_pipeline_statistics_query_shader(struct radv_device *device) {
281 /* the shader this builds is roughly
282 *
283 * push constants {
284 * uint32_t flags;
285 * uint32_t dst_stride;
286 * uint32_t stats_mask;
287 * uint32_t avail_offset;
288 * };
289 *
290 * uint32_t src_stride = pipelinestat_block_size * 2;
291 *
292 * location(binding = 0) buffer dst_buf;
293 * location(binding = 1) buffer src_buf;
294 *
295 * void main() {
296 * uint64_t src_offset = src_stride * global_id.x;
297 * uint64_t dst_base = dst_stride * global_id.x;
298 * uint64_t dst_offset = dst_base;
299 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
300 * uint32_t elem_count = stats_mask >> 16;
301 * uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
302 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
303 * dst_buf[dst_offset + elem_count * elem_size] = available32;
304 * }
305 * if ((bool)available32) {
306 * // repeat 11 times:
307 * if (stats_mask & (1 << 0)) {
308 * uint64_t start = src_buf[src_offset + 8 * indices[0]];
309 * uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
310 * uint64_t result = end - start;
311 * if (flags & VK_QUERY_RESULT_64_BIT)
312 * dst_buf[dst_offset] = result;
313 * else
314 * dst_buf[dst_offset] = (uint32_t)result;
315 * dst_offset += elem_size;
316 * }
317 * } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
318 * // Set everything to 0 as we don't know what is valid.
319 * for (int i = 0; i < elem_count; ++i)
320 * dst_buf[dst_base + elem_size * i] = 0;
321 * }
322 * }
323 */
324 nir_builder b;
325 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
326 b.shader->info.name = ralloc_strdup(b.shader, "pipeline_statistics_query");
327 b.shader->info.cs.local_size[0] = 64;
328 b.shader->info.cs.local_size[1] = 1;
329 b.shader->info.cs.local_size[2] = 1;
330
331 nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
332
333 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
334 nir_ssa_def *stats_mask = radv_load_push_int(&b, 8, "stats_mask");
335 nir_ssa_def *avail_offset = radv_load_push_int(&b, 12, "avail_offset");
336
337 nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
338 nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
339
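/* Compute global ID. */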
340 nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
341 nir_ssa_def *wg_id = nir_load_work_group_id(&b, 32);
342 nir_ssa_def *block_size = nir_imm_ivec4(&b,
343 b.shader->info.cs.local_size[0],
344 b.shader->info.cs.local_size[1],
345 b.shader->info.cs.local_size[2], 0);
346 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
347 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
348
349 nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
350 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
351 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
352 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
353
354
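/* Load the 32-bit availability word the GPU wrote for this query. */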
355 avail_offset = nir_iadd(&b, avail_offset,
356 nir_imul(&b, global_id, nir_imm_int(&b, 4)));
357
358 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
359 load->src[0] = nir_src_for_ssa(src_buf);
360 load->src[1] = nir_src_for_ssa(avail_offset);
361 nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
362 load->num_components = 1;
363 nir_intrinsic_set_align(load, 4, 0);
364 nir_builder_instr_insert(&b, &load->instr);
365 nir_ssa_def *available32 = &load->dest.ssa;
366
367 nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
368 nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
369 nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16));
370
371 radv_store_availability(&b, flags, dst_buf,
372 nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
373 available32);
374
375 nir_push_if(&b, nir_i2b(&b, available32));
376
377 nir_store_var(&b, output_offset, output_base, 0x1);
378 for (int i = 0; i < 11; ++i) {
379 nir_push_if(&b, nir_test_flag(&b, stats_mask, 1u << i));
380
381 load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
382 load->src[0] = nir_src_for_ssa(src_buf);
383 load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
384 nir_imm_int(&b, pipeline_statistics_indices[i] * 8)));
385 nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
386 load->num_components = 1;
387 nir_intrinsic_set_align(load, 8, 0);
388 nir_builder_instr_insert(&b, &load->instr);
389 nir_ssa_def *start = &load->dest.ssa;
390
391 load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
392 load->src[0] = nir_src_for_ssa(src_buf);
393 load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
394 nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size)));
395 nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
396 load->num_components = 1;
397 nir_intrinsic_set_align(load, 8, 0);
398 nir_builder_instr_insert(&b, &load->instr);
399 nir_ssa_def *end = &load->dest.ssa;
400
401 nir_ssa_def *result = nir_isub(&b, end, start);
402
403 /* Store result */
404 nir_push_if(&b, result_is_64bit);
405
406 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
407 store->src[0] = nir_src_for_ssa(result);
408 store->src[1] = nir_src_for_ssa(dst_buf);
409 store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
410 nir_intrinsic_set_write_mask(store, 0x1);
411 nir_intrinsic_set_align(store, 8, 0);
412 store->num_components = 1;
413 nir_builder_instr_insert(&b, &store->instr);
414
415 nir_push_else(&b, NULL);
416
417 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
418 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, result));
419 store->src[1] = nir_src_for_ssa(dst_buf);
420 store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
421 nir_intrinsic_set_write_mask(store, 0x1);
422 nir_intrinsic_set_align(store, 4, 0);
423 store->num_components = 1;
424 nir_builder_instr_insert(&b, &store->instr);
425
426 nir_pop_if(&b, NULL);
427
428 nir_store_var(&b, output_offset,
429 nir_iadd(&b, nir_load_var(&b, output_offset),
430 elem_size), 0x1);
431
432 nir_pop_if(&b, NULL);
433 }
434
435 nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
436
437 nir_push_if(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
438
439 /* Stores zeros in all outputs. */
440
441 nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
442 nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
443
444 nir_loop *loop = nir_push_loop(&b);
445
446 nir_ssa_def *current_counter = nir_load_var(&b, counter);
447 radv_break_on_count(&b, counter, elem_count);
448
449 nir_ssa_def *output_elem = nir_iadd(&b, output_base,
450 nir_imul(&b, elem_size, current_counter));
451 nir_push_if(&b, result_is_64bit);
452
453 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
454 store->src[0] = nir_src_for_ssa(nir_imm_int64(&b, 0));
455 store->src[1] = nir_src_for_ssa(dst_buf);
456 store->src[2] = nir_src_for_ssa(output_elem);
457 nir_intrinsic_set_write_mask(store, 0x1);
458 nir_intrinsic_set_align(store, 8, 0);
459 store->num_components = 1;
460 nir_builder_instr_insert(&b, &store->instr);
461
462 nir_push_else(&b, NULL);
463
464 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
465 store->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
466 store->src[1] = nir_src_for_ssa(dst_buf);
467 store->src[2] = nir_src_for_ssa(output_elem);
468 nir_intrinsic_set_write_mask(store, 0x1);
469 nir_intrinsic_set_align(store, 4, 0);
470 store->num_components = 1;
471 nir_builder_instr_insert(&b, &store->instr);
472
473 nir_pop_if(&b, NULL);
474
475 nir_pop_loop(&b, loop);
476 nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
477 nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
478 return b.shader;
479 }
480
481 static nir_shader *
482 build_tfb_query_shader(struct radv_device *device)
483 {
484 /* the shader this builds is roughly
485 *
486 * uint32_t src_stride = 32;
487 *
488 * location(binding = 0) buffer dst_buf;
489 * location(binding = 1) buffer src_buf;
490 *
491 * void main() {
492 * uint64_t result[2] = {};
493 * bool available = false;
494 * uint64_t src_offset = src_stride * global_id.x;
495 * uint64_t dst_offset = dst_stride * global_id.x;
496 * uint64_t *src_data = src_buf[src_offset];
497 * uint32_t avail = (src_data[0] >> 32) &
498 * (src_data[1] >> 32) &
499 * (src_data[2] >> 32) &
500 * (src_data[3] >> 32);
501 * if (avail & 0x80000000) {
502 * result[0] = src_data[3] - src_data[1];
503 * result[1] = src_data[2] - src_data[0];
504 * available = true;
505 * }
506 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
507 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
508 * if (flags & VK_QUERY_RESULT_64_BIT) {
509 * dst_buf[dst_offset] = result;
510 * } else {
511 * dst_buf[dst_offset] = (uint32_t)result;
512 * }
513 * }
514 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
515 * dst_buf[dst_offset + result_size] = available;
516 * }
517 * }
518 */
519 nir_builder b;
520 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
521 b.shader->info.name = ralloc_strdup(b.shader, "tfb_query");
522 b.shader->info.cs.local_size[0] = 64;
523 b.shader->info.cs.local_size[1] = 1;
524 b.shader->info.cs.local_size[2] = 1;
525
526 /* Create and initialize local variables. */
527 nir_variable *result =
528 nir_local_variable_create(b.impl,
529 glsl_vector_type(GLSL_TYPE_UINT64, 2),
530 "result");
531 nir_variable *available =
532 nir_local_variable_create(b.impl, glsl_bool_type(), "available");
533
534 nir_store_var(&b, result,
535 nir_vec2(&b, nir_imm_int64(&b, 0),
536 nir_imm_int64(&b, 0)), 0x3);
537 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
538
539 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
540
541 /* Load resources. */
542 nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
543 nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
544
545 /* Compute global ID. */
546 nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
547 nir_ssa_def *wg_id = nir_load_work_group_id(&b, 32);
548 nir_ssa_def *block_size = nir_imm_ivec4(&b,
549 b.shader->info.cs.local_size[0],
550 b.shader->info.cs.local_size[1],
551 b.shader->info.cs.local_size[2], 0);
552 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
553 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
554
555 /* Compute src/dst strides. */
556 nir_ssa_def *input_stride = nir_imm_int(&b, 32);
557 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
558 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
559 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
560
561 /* Load data from the query pool. */
562 nir_intrinsic_instr *load1 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
563 load1->src[0] = nir_src_for_ssa(src_buf);
564 load1->src[1] = nir_src_for_ssa(input_base);
565 nir_ssa_dest_init(&load1->instr, &load1->dest, 4, 32, NULL);
566 load1->num_components = 4;
567 nir_intrinsic_set_align(load1, 32, 0);
568 nir_builder_instr_insert(&b, &load1->instr);
569
570 nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
571 load2->src[0] = nir_src_for_ssa(src_buf);
572 load2->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, nir_imm_int(&b, 16)));
573 nir_ssa_dest_init(&load2->instr, &load2->dest, 4, 32, NULL);
574 load2->num_components = 4;
575 nir_intrinsic_set_align(load2, 16, 0);
576 nir_builder_instr_insert(&b, &load2->instr);
577
578 /* Check if result is available. */
579 nir_ssa_def *avails[2];
580 avails[0] = nir_iand(&b, nir_channel(&b, &load1->dest.ssa, 1),
581 nir_channel(&b, &load1->dest.ssa, 3));
582 avails[1] = nir_iand(&b, nir_channel(&b, &load2->dest.ssa, 1),
583 nir_channel(&b, &load2->dest.ssa, 3));
584 nir_ssa_def *result_is_available =
585 nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]),
586 nir_imm_int(&b, 0x80000000)));
587
588 /* Only compute result if available. */
589 nir_push_if(&b, result_is_available);
590
591 /* Pack values. */
592 nir_ssa_def *packed64[4];
593 packed64[0] = nir_pack_64_2x32(&b, nir_vec2(&b,
594 nir_channel(&b, &load1->dest.ssa, 0),
595 nir_channel(&b, &load1->dest.ssa, 1)));
596 packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b,
597 nir_channel(&b, &load1->dest.ssa, 2),
598 nir_channel(&b, &load1->dest.ssa, 3)));
599 packed64[2] = nir_pack_64_2x32(&b, nir_vec2(&b,
600 nir_channel(&b, &load2->dest.ssa, 0),
601 nir_channel(&b, &load2->dest.ssa, 1)));
602 packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b,
603 nir_channel(&b, &load2->dest.ssa, 2),
604 nir_channel(&b, &load2->dest.ssa, 3)));
605
606 /* Compute result. */
607 nir_ssa_def *num_primitive_written =
608 nir_isub(&b, packed64[3], packed64[1]);
609 nir_ssa_def *primitive_storage_needed =
610 nir_isub(&b, packed64[2], packed64[0]);
611
612 nir_store_var(&b, result,
613 nir_vec2(&b, num_primitive_written,
614 primitive_storage_needed), 0x3);
615 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
616
617 nir_pop_if(&b, NULL);
618
619 /* Determine if result is 64 or 32 bit. */
620 nir_ssa_def *result_is_64bit =
621 nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
622 nir_ssa_def *result_size =
623 nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16),
624 nir_imm_int(&b, 8));
625
626 /* Store the result if complete or partial results have been requested. */
627 nir_push_if(&b,
628 nir_ior(&b,
629 nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
630 nir_load_var(&b, available)));
631
632 /* Store result. */
633 nir_push_if(&b, result_is_64bit);
634
635 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
636 store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
637 store->src[1] = nir_src_for_ssa(dst_buf);
638 store->src[2] = nir_src_for_ssa(output_base);
639 nir_intrinsic_set_write_mask(store, 0x3);
640 nir_intrinsic_set_align(store, 8, 0);
641 store->num_components = 2;
642 nir_builder_instr_insert(&b, &store->instr);
643
644 nir_push_else(&b, NULL);
645
646 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
647 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
648 store->src[1] = nir_src_for_ssa(dst_buf);
649 store->src[2] = nir_src_for_ssa(output_base);
650 nir_intrinsic_set_write_mask(store, 0x3);
651 nir_intrinsic_set_align(store, 4, 0);
652 store->num_components = 2;
653 nir_builder_instr_insert(&b, &store->instr);
654
655 nir_pop_if(&b, NULL);
656 nir_pop_if(&b, NULL);
657
658 radv_store_availability(&b, flags, dst_buf,
659 nir_iadd(&b, result_size, output_base),
660 nir_b2i32(&b, nir_load_var(&b, available)));
661
662 return b.shader;
663 }
664
665 static nir_shader *
666 build_timestamp_query_shader(struct radv_device *device)
667 {
668 /* the shader this builds is roughly
669 *
670 * uint32_t src_stride = 8;
671 *
672 * location(binding = 0) buffer dst_buf;
673 * location(binding = 1) buffer src_buf;
674 *
675 * void main() {
676 * uint64_t result = 0;
677 * bool available = false;
678 * uint64_t src_offset = src_stride * global_id.x;
679 * uint64_t dst_offset = dst_stride * global_id.x;
680 * uint64_t timestamp = src_buf[src_offset];
681 * if (timestamp != TIMESTAMP_NOT_READY) {
682 * result = timestamp;
683 * available = true;
684 * }
685 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
686 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
687 * if (flags & VK_QUERY_RESULT_64_BIT) {
688 * dst_buf[dst_offset] = result;
689 * } else {
690 * dst_buf[dst_offset] = (uint32_t)result;
691 * }
692 * }
693 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
694 * dst_buf[dst_offset + result_size] = available;
695 * }
696 * }
697 */
698 nir_builder b;
699 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
700 b.shader->info.name = ralloc_strdup(b.shader, "timestamp_query");
701 b.shader->info.cs.local_size[0] = 64;
702 b.shader->info.cs.local_size[1] = 1;
703 b.shader->info.cs.local_size[2] = 1;
704
705 /* Create and initialize local variables. */
706 nir_variable *result =
707 nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
708 nir_variable *available =
709 nir_local_variable_create(b.impl, glsl_bool_type(), "available");
710
711 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
712 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
713
714 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
715
716 /* Load resources. */
717 nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
718 nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
719
720 /* Compute global ID. */
721 nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
722 nir_ssa_def *wg_id = nir_load_work_group_id(&b, 32);
723 nir_ssa_def *block_size = nir_imm_ivec4(&b,
724 b.shader->info.cs.local_size[0],
725 b.shader->info.cs.local_size[1],
726 b.shader->info.cs.local_size[2], 0);
727 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
728 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
729
730 /* Compute src/dst strides. */
731 nir_ssa_def *input_stride = nir_imm_int(&b, 8);
732 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
733 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
734 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
735
736 /* Load data from the query pool. */
737 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
738 load->src[0] = nir_src_for_ssa(src_buf);
739 load->src[1] = nir_src_for_ssa(input_base);
740 nir_ssa_dest_init(&load->instr, &load->dest, 2, 32, NULL);
741 load->num_components = 2;
742 nir_intrinsic_set_align(load, 8, 0);
743 nir_builder_instr_insert(&b, &load->instr);
744
745 /* Pack the timestamp. */
746 nir_ssa_def *timestamp;
747 timestamp = nir_pack_64_2x32(&b, nir_vec2(&b,
748 nir_channel(&b, &load->dest.ssa, 0),
749 nir_channel(&b, &load->dest.ssa, 1)));
750
751 /* Check if result is available. */
752 nir_ssa_def *result_is_available =
753 nir_i2b(&b, nir_ine(&b, timestamp,
754 nir_imm_int64(&b, TIMESTAMP_NOT_READY)));
755
756 /* Only store result if available. */
757 nir_push_if(&b, result_is_available);
758
759 nir_store_var(&b, result, timestamp, 0x1);
760 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
761
762 nir_pop_if(&b, NULL);
763
764 /* Determine if result is 64 or 32 bit. */
765 nir_ssa_def *result_is_64bit =
766 nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
767 nir_ssa_def *result_size =
768 nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8),
769 nir_imm_int(&b, 4));
770
771 /* Store the result if complete or partial results have been requested. */
772 nir_push_if(&b, nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
773 nir_load_var(&b, available)));
774
775 /* Store result. */
776 nir_push_if(&b, result_is_64bit);
777
778 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
779 store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
780 store->src[1] = nir_src_for_ssa(dst_buf);
781 store->src[2] = nir_src_for_ssa(output_base);
782 nir_intrinsic_set_write_mask(store, 0x1);
783 nir_intrinsic_set_align(store, 8, 0);
784 store->num_components = 1;
785 nir_builder_instr_insert(&b, &store->instr);
786
787 nir_push_else(&b, NULL);
788
789 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
790 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
791 store->src[1] = nir_src_for_ssa(dst_buf);
792 store->src[2] = nir_src_for_ssa(output_base);
793 nir_intrinsic_set_write_mask(store, 0x1);
794 nir_intrinsic_set_align(store, 4, 0);
795 store->num_components = 1;
796 nir_builder_instr_insert(&b, &store->instr);
797
798 nir_pop_if(&b, NULL);
799
800 nir_pop_if(&b, NULL);
801
802 radv_store_availability(&b, flags, dst_buf,
803 nir_iadd(&b, result_size, output_base),
804 nir_b2i32(&b, nir_load_var(&b, available)));
805
806 return b.shader;
807 }
808
809 static VkResult radv_device_init_meta_query_state_internal(struct radv_device *device)
810 {
811 VkResult result;
812 struct radv_shader_module occlusion_cs = { .nir = NULL };
813 struct radv_shader_module pipeline_statistics_cs = { .nir = NULL };
814 struct radv_shader_module tfb_cs = { .nir = NULL };
815 struct radv_shader_module timestamp_cs = { .nir = NULL };
816
817 mtx_lock(&device->meta_state.mtx);
818 if (device->meta_state.query.pipeline_statistics_query_pipeline) {
819 mtx_unlock(&device->meta_state.mtx);
820 return VK_SUCCESS;
821 }
822 occlusion_cs.nir = build_occlusion_query_shader(device);
823 pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device);
824 tfb_cs.nir = build_tfb_query_shader(device);
825 timestamp_cs.nir = build_timestamp_query_shader(device);
826
827 VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
828 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
829 .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
830 .bindingCount = 2,
831 .pBindings = (VkDescriptorSetLayoutBinding[]) {
832 {
833 .binding = 0,
834 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
835 .descriptorCount = 1,
836 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
837 .pImmutableSamplers = NULL
838 },
839 {
840 .binding = 1,
841 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
842 .descriptorCount = 1,
843 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
844 .pImmutableSamplers = NULL
845 },
846 }
847 };
848
849 result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
850 &occlusion_ds_create_info,
851 &device->meta_state.alloc,
852 &device->meta_state.query.ds_layout);
853 if (result != VK_SUCCESS)
854 goto fail;
855
856 VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
857 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
858 .setLayoutCount = 1,
859 .pSetLayouts = &device->meta_state.query.ds_layout,
860 .pushConstantRangeCount = 1,
861 .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16},
862 };
863
864 result = radv_CreatePipelineLayout(radv_device_to_handle(device),
865 &occlusion_pl_create_info,
866 &device->meta_state.alloc,
867 &device->meta_state.query.p_layout);
868 if (result != VK_SUCCESS)
869 goto fail;
870
871 VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
872 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
873 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
874 .module = radv_shader_module_to_handle(&occlusion_cs),
875 .pName = "main",
876 .pSpecializationInfo = NULL,
877 };
878
879 VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
880 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
881 .stage = occlusion_pipeline_shader_stage,
882 .flags = 0,
883 .layout = device->meta_state.query.p_layout,
884 };
885
886 result = radv_CreateComputePipelines(radv_device_to_handle(device),
887 radv_pipeline_cache_to_handle(&device->meta_state.cache),
888 1, &occlusion_vk_pipeline_info, NULL,
889 &device->meta_state.query.occlusion_query_pipeline);
890 if (result != VK_SUCCESS)
891 goto fail;
892
893 VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
894 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
895 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
896 .module = radv_shader_module_to_handle(&pipeline_statistics_cs),
897 .pName = "main",
898 .pSpecializationInfo = NULL,
899 };
900
901 VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
902 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
903 .stage = pipeline_statistics_pipeline_shader_stage,
904 .flags = 0,
905 .layout = device->meta_state.query.p_layout,
906 };
907
908 result = radv_CreateComputePipelines(radv_device_to_handle(device),
909 radv_pipeline_cache_to_handle(&device->meta_state.cache),
910 1, &pipeline_statistics_vk_pipeline_info, NULL,
911 &device->meta_state.query.pipeline_statistics_query_pipeline);
912 if (result != VK_SUCCESS)
913 goto fail;
914
915 VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
916 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
917 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
918 .module = radv_shader_module_to_handle(&tfb_cs),
919 .pName = "main",
920 .pSpecializationInfo = NULL,
921 };
922
923 VkComputePipelineCreateInfo tfb_pipeline_info = {
924 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
925 .stage = tfb_pipeline_shader_stage,
926 .flags = 0,
927 .layout = device->meta_state.query.p_layout,
928 };
929
930 result = radv_CreateComputePipelines(radv_device_to_handle(device),
931 radv_pipeline_cache_to_handle(&device->meta_state.cache),
932 1, &tfb_pipeline_info, NULL,
933 &device->meta_state.query.tfb_query_pipeline);
934 if (result != VK_SUCCESS)
935 goto fail;
936
937 VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
938 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
939 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
940 .module = radv_shader_module_to_handle(&timestamp_cs),
941 .pName = "main",
942 .pSpecializationInfo = NULL,
943 };
944
945 VkComputePipelineCreateInfo timestamp_pipeline_info = {
946 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
947 .stage = timestamp_pipeline_shader_stage,
948 .flags = 0,
949 .layout = device->meta_state.query.p_layout,
950 };
951
952 result = radv_CreateComputePipelines(radv_device_to_handle(device),
953 radv_pipeline_cache_to_handle(&device->meta_state.cache),
954 1, &timestamp_pipeline_info, NULL,
955 &device->meta_state.query.timestamp_query_pipeline);
956
957 fail:
958 if (result != VK_SUCCESS)
959 radv_device_finish_meta_query_state(device);
960 ralloc_free(occlusion_cs.nir);
961 ralloc_free(pipeline_statistics_cs.nir);
962 ralloc_free(tfb_cs.nir);
963 ralloc_free(timestamp_cs.nir);
964 mtx_unlock(&device->meta_state.mtx);
965 return result;
966 }
967
968 VkResult radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
969 {
970 if (on_demand)
971 return VK_SUCCESS;
972
973 return radv_device_init_meta_query_state_internal(device);
974 }
975
976 void radv_device_finish_meta_query_state(struct radv_device *device)
977 {
978 if (device->meta_state.query.tfb_query_pipeline)
979 radv_DestroyPipeline(radv_device_to_handle(device),
980 device->meta_state.query.tfb_query_pipeline,
981 &device->meta_state.alloc);
982
983 if (device->meta_state.query.pipeline_statistics_query_pipeline)
984 radv_DestroyPipeline(radv_device_to_handle(device),
985 device->meta_state.query.pipeline_statistics_query_pipeline,
986 &device->meta_state.alloc);
987
988 if (device->meta_state.query.occlusion_query_pipeline)
989 radv_DestroyPipeline(radv_device_to_handle(device),
990 device->meta_state.query.occlusion_query_pipeline,
991 &device->meta_state.alloc);
992
993 if (device->meta_state.query.timestamp_query_pipeline)
994 radv_DestroyPipeline(radv_device_to_handle(device),
995 device->meta_state.query.timestamp_query_pipeline,
996 &device->meta_state.alloc);
997
998 if (device->meta_state.query.p_layout)
999 radv_DestroyPipelineLayout(radv_device_to_handle(device),
1000 device->meta_state.query.p_layout,
1001 &device->meta_state.alloc);
1002
1003 if (device->meta_state.query.ds_layout)
1004 radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
1005 device->meta_state.query.ds_layout,
1006 &device->meta_state.alloc);
1007 }
1008
1009 static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
1010 VkPipeline *pipeline,
1011 struct radeon_winsys_bo *src_bo,
1012 struct radeon_winsys_bo *dst_bo,
1013 uint64_t src_offset, uint64_t dst_offset,
1014 uint32_t src_stride, uint32_t dst_stride,
1015 uint32_t count, uint32_t flags,
1016 uint32_t pipeline_stats_mask, uint32_t avail_offset)
1017 {
1018 struct radv_device *device = cmd_buffer->device;
1019 struct radv_meta_saved_state saved_state;
1020 bool old_predicating;
1021
1022 if (!*pipeline) {
1023 VkResult ret = radv_device_init_meta_query_state_internal(device);
1024 if (ret != VK_SUCCESS) {
1025 cmd_buffer->record_result = ret;
1026 return;
1027 }
1028 }
1029
1030 radv_meta_save(&saved_state, cmd_buffer,
1031 RADV_META_SAVE_COMPUTE_PIPELINE |
1032 RADV_META_SAVE_CONSTANTS |
1033 RADV_META_SAVE_DESCRIPTORS);
1034
1035 /* VK_EXT_conditional_rendering says that copy commands should not be
1036 * affected by conditional rendering.
1037 */
1038 old_predicating = cmd_buffer->state.predicating;
1039 cmd_buffer->state.predicating = false;
1040
1041 struct radv_buffer dst_buffer = {
1042 .bo = dst_bo,
1043 .offset = dst_offset,
1044 .size = dst_stride * count
1045 };
1046
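/* For pipeline statistics queries the source range must also cover the
 * per-query availability words stored past the result blocks. */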
1047 struct radv_buffer src_buffer = {
1048 .bo = src_bo,
1049 .offset = src_offset,
1050 .size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset)
1051 };
1052
1053 radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
1054 VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1055
1056 radv_meta_push_descriptor_set(cmd_buffer,
1057 VK_PIPELINE_BIND_POINT_COMPUTE,
1058 device->meta_state.query.p_layout,
1059 0, /* set */
1060 2, /* descriptorWriteCount */
1061 (VkWriteDescriptorSet[]) {
1062 {
1063 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1064 .dstBinding = 0,
1065 .dstArrayElement = 0,
1066 .descriptorCount = 1,
1067 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1068 .pBufferInfo = &(VkDescriptorBufferInfo) {
1069 .buffer = radv_buffer_to_handle(&dst_buffer),
1070 .offset = 0,
1071 .range = VK_WHOLE_SIZE
1072 }
1073 },
1074 {
1075 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1076 .dstBinding = 1,
1077 .dstArrayElement = 0,
1078 .descriptorCount = 1,
1079 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1080 .pBufferInfo = &(VkDescriptorBufferInfo) {
1081 .buffer = radv_buffer_to_handle(&src_buffer),
1082 .offset = 0,
1083 .range = VK_WHOLE_SIZE
1084 }
1085 }
1086 });
1087
1088 /* Encode the number of elements for easy access by the shader. */
1089 pipeline_stats_mask &= 0x7ff;
1090 pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
1091
1092 avail_offset -= src_offset;
1093
1094 struct {
1095 uint32_t flags;
1096 uint32_t dst_stride;
1097 uint32_t pipeline_stats_mask;
1098 uint32_t avail_offset;
1099 } push_constants = {
1100 flags,
1101 dst_stride,
1102 pipeline_stats_mask,
1103 avail_offset
1104 };
1105
1106 radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
1107 device->meta_state.query.p_layout,
1108 VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
1109 &push_constants);
1110
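/* Invalidate L2 and the vector cache so the resolve shader sees the query
 * data that the GPU wrote through other paths. */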
1111 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 |
1112 RADV_CMD_FLAG_INV_VCACHE;
1113
1114 if (flags & VK_QUERY_RESULT_WAIT_BIT)
1115 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1116
1117 radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1118
1119 /* Restore conditional rendering. */
1120 cmd_buffer->state.predicating = old_predicating;
1121
1122 radv_meta_restore(&saved_state, cmd_buffer);
1123 }
1124
1125 static bool
1126 radv_query_pool_needs_gds(struct radv_device *device,
1127 struct radv_query_pool *pool)
1128 {
1129 /* The number of primitives generated by geometry shader invocations is
1130 * only counted by the hardware if GS uses the legacy path. When NGG GS
1131 * is used, the hardware can't know the number of generated primitives
1132 * and we have to count it manually inside the shader. To achieve that, the
1133 * driver does a plain GDS atomic to accumulate that value.
1134 * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end
1135 * query.
1136 */
1137 return device->physical_device->use_ngg &&
1138 (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
1139 }
1140
1141 static void
1142 radv_destroy_query_pool(struct radv_device *device,
1143 const VkAllocationCallbacks *pAllocator,
1144 struct radv_query_pool *pool)
1145 {
1146 if (pool->bo)
1147 device->ws->buffer_destroy(pool->bo);
1148 vk_object_base_finish(&pool->base);
1149 vk_free2(&device->vk.alloc, pAllocator, pool);
1150 }
1151
1152 VkResult radv_CreateQueryPool(
1153 VkDevice _device,
1154 const VkQueryPoolCreateInfo* pCreateInfo,
1155 const VkAllocationCallbacks* pAllocator,
1156 VkQueryPool* pQueryPool)
1157 {
1158 RADV_FROM_HANDLE(radv_device, device, _device);
1159 struct radv_query_pool *pool = vk_alloc2(&device->vk.alloc, pAllocator,
1160 sizeof(*pool), 8,
1161 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1162
1163 if (!pool)
1164 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1165
1166 vk_object_base_init(&device->vk, &pool->base,
1167 VK_OBJECT_TYPE_QUERY_POOL);
1168
1169 switch(pCreateInfo->queryType) {
1170 case VK_QUERY_TYPE_OCCLUSION:
1171 pool->stride = 16 * device->physical_device->rad_info.num_render_backends;
1172 break;
1173 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1174 pool->stride = pipelinestat_block_size * 2;
1175 break;
1176 case VK_QUERY_TYPE_TIMESTAMP:
1177 pool->stride = 8;
1178 break;
1179 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1180 pool->stride = 32;
1181 break;
1182 default:
1183 unreachable("creating unhandled query type");
1184 }
1185
1186 pool->type = pCreateInfo->queryType;
1187 pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
1188 pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1189 pool->size = pool->availability_offset;
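/* Pipeline statistics pools additionally store one 32-bit availability
 * word per query after the counter blocks. */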
1190 if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
1191 pool->size += 4 * pCreateInfo->queryCount;
1192
1193 pool->bo = device->ws->buffer_create(device->ws, pool->size,
1194 64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING,
1195 RADV_BO_PRIORITY_QUERY_POOL);
1196 if (!pool->bo) {
1197 radv_destroy_query_pool(device, pAllocator, pool);
1198 return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1199 }
1200
1201 pool->ptr = device->ws->buffer_map(pool->bo);
1202 if (!pool->ptr) {
1203 radv_destroy_query_pool(device, pAllocator, pool);
1204 return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1205 }
1206
1207 *pQueryPool = radv_query_pool_to_handle(pool);
1208 return VK_SUCCESS;
1209 }
1210
1211 void radv_DestroyQueryPool(
1212 VkDevice _device,
1213 VkQueryPool _pool,
1214 const VkAllocationCallbacks* pAllocator)
1215 {
1216 RADV_FROM_HANDLE(radv_device, device, _device);
1217 RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
1218
1219 if (!pool)
1220 return;
1221
1222 radv_destroy_query_pool(device, pAllocator, pool);
1223 }
1224
1225 VkResult radv_GetQueryPoolResults(
1226 VkDevice _device,
1227 VkQueryPool queryPool,
1228 uint32_t firstQuery,
1229 uint32_t queryCount,
1230 size_t dataSize,
1231 void* pData,
1232 VkDeviceSize stride,
1233 VkQueryResultFlags flags)
1234 {
1235 RADV_FROM_HANDLE(radv_device, device, _device);
1236 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1237 char *data = pData;
1238 VkResult result = VK_SUCCESS;
1239
1240 if (radv_device_is_lost(device))
1241 return VK_ERROR_DEVICE_LOST;
1242
1243 for(unsigned i = 0; i < queryCount; ++i, data += stride) {
1244 char *dest = data;
1245 unsigned query = firstQuery + i;
1246 char *src = pool->ptr + query * pool->stride;
1247 uint32_t available;
1248
1249 switch (pool->type) {
1250 case VK_QUERY_TYPE_TIMESTAMP: {
1251 uint64_t const *src64 = (uint64_t const *)src;
1252 uint64_t value;
1253
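/* The pool is reset to TIMESTAMP_NOT_READY; if WAIT_BIT is set, spin until
 * the GPU has overwritten the slot with a real timestamp. */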
1254 do {
1255 value = p_atomic_read(src64);
1256 } while (value == TIMESTAMP_NOT_READY &&
1257 (flags & VK_QUERY_RESULT_WAIT_BIT));
1258
1259 available = value != TIMESTAMP_NOT_READY;
1260
1261 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1262 result = VK_NOT_READY;
1263
1264 if (flags & VK_QUERY_RESULT_64_BIT) {
1265 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1266 *(uint64_t*)dest = value;
1267 dest += 8;
1268 } else {
1269 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1270 *(uint32_t*)dest = (uint32_t)value;
1271 dest += 4;
1272 }
1273 break;
1274 }
1275 case VK_QUERY_TYPE_OCCLUSION: {
1276 uint64_t const *src64 = (uint64_t const *)src;
1277 uint32_t db_count = device->physical_device->rad_info.num_render_backends;
1278 uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1279 uint64_t sample_count = 0;
1280 available = 1;
1281
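/* Accumulate the begin/end ZPASS counter pairs of every enabled render
 * backend; a counter is valid once its bit 63 is set. */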
1282 for (int i = 0; i < db_count; ++i) {
1283 uint64_t start, end;
1284
1285 if (!(enabled_rb_mask & (1 << i)))
1286 continue;
1287
1288 do {
1289 start = p_atomic_read(src64 + 2 * i);
1290 end = p_atomic_read(src64 + 2 * i + 1);
1291 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT));
1292
1293 if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1294 available = 0;
1295 else {
1296 sample_count += end - start;
1297 }
1298 }
1299
1300 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1301 result = VK_NOT_READY;
1302
1303 if (flags & VK_QUERY_RESULT_64_BIT) {
1304 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1305 *(uint64_t*)dest = sample_count;
1306 dest += 8;
1307 } else {
1308 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1309 *(uint32_t*)dest = sample_count;
1310 dest += 4;
1311 }
1312 break;
1313 }
1314 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1315 const uint32_t *avail_ptr = (const uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
1316
1317 do {
1318 available = p_atomic_read(avail_ptr);
1319 } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT));
1320
1321 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1322 result = VK_NOT_READY;
1323
1324 const uint64_t *start = (uint64_t*)src;
1325 const uint64_t *stop = (uint64_t*)(src + pipelinestat_block_size);
1326 if (flags & VK_QUERY_RESULT_64_BIT) {
1327 uint64_t *dst = (uint64_t*)dest;
1328 dest += util_bitcount(pool->pipeline_stats_mask) * 8;
1329 for(int i = 0; i < 11; ++i) {
1330 if(pool->pipeline_stats_mask & (1u << i)) {
1331 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1332 *dst = stop[pipeline_statistics_indices[i]] -
1333 start[pipeline_statistics_indices[i]];
1334 dst++;
1335 }
1336 }
1337
1338 } else {
1339 uint32_t *dst = (uint32_t*)dest;
1340 dest += util_bitcount(pool->pipeline_stats_mask) * 4;
1341 for(int i = 0; i < 11; ++i) {
1342 if(pool->pipeline_stats_mask & (1u << i)) {
1343 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1344 *dst = stop[pipeline_statistics_indices[i]] -
1345 start[pipeline_statistics_indices[i]];
1346 dst++;
1347 }
1348 }
1349 }
1350 break;
1351 }
1352 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1353 uint64_t const *src64 = (uint64_t const *)src;
1354 uint64_t num_primitives_written;
1355 uint64_t primitive_storage_needed;
1356
1357 /* SAMPLE_STREAMOUTSTATS stores this structure:
1358 * {
1359 * u64 NumPrimitivesWritten;
1360 * u64 PrimitiveStorageNeeded;
1361 * }
1362 */
1363 available = 1;
1364 for (int j = 0; j < 4; j++) {
1365 if (!(p_atomic_read(src64 + j) & 0x8000000000000000UL))
1366 available = 0;
1367 }
1368
1369 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1370 result = VK_NOT_READY;
1371
1372 num_primitives_written = src64[3] - src64[1];
1373 primitive_storage_needed = src64[2] - src64[0];
1374
1375 if (flags & VK_QUERY_RESULT_64_BIT) {
1376 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1377 *(uint64_t *)dest = num_primitives_written;
1378 dest += 8;
1379 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1380 *(uint64_t *)dest = primitive_storage_needed;
1381 dest += 8;
1382 } else {
1383 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1384 *(uint32_t *)dest = num_primitives_written;
1385 dest += 4;
1386 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1387 *(uint32_t *)dest = primitive_storage_needed;
1388 dest += 4;
1389 }
1390 break;
1391 }
1392 default:
1393 unreachable("trying to get results of unhandled query type");
1394 }
1395
1396 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1397 if (flags & VK_QUERY_RESULT_64_BIT) {
1398 *(uint64_t*)dest = available;
1399 } else {
1400 *(uint32_t*)dest = available;
1401 }
1402 }
1403 }
1404
1405 return result;
1406 }
1407
1408 static void emit_query_flush(struct radv_cmd_buffer *cmd_buffer,
1409 struct radv_query_pool *pool)
1410 {
1411 if (cmd_buffer->pending_reset_query) {
1412 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1413 /* Only need to flush caches if the query pool size is
1414 * large enough to be reset using the compute shader
1415 * path. Small pools don't need any cache flushes
1416 * because we use a CP dma clear.
1417 */
1418 si_emit_cache_flush(cmd_buffer);
1419 }
1420 }
1421 }
1422
1423 void radv_CmdCopyQueryPoolResults(
1424 VkCommandBuffer commandBuffer,
1425 VkQueryPool queryPool,
1426 uint32_t firstQuery,
1427 uint32_t queryCount,
1428 VkBuffer dstBuffer,
1429 VkDeviceSize dstOffset,
1430 VkDeviceSize stride,
1431 VkQueryResultFlags flags)
1432 {
1433 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1434 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1435 RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
1436 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1437 uint64_t va = radv_buffer_get_va(pool->bo);
1438 uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
1439 dest_va += dst_buffer->offset + dstOffset;
1440
1441 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
1442 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
1443
1444 /* From the Vulkan spec 1.1.108:
1445 *
1446 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1447 * previous uses of vkCmdResetQueryPool in the same queue, without any
1448 * additional synchronization."
1449 *
1450 * So, we have to flush the caches if the compute shader path was used.
1451 */
1452 emit_query_flush(cmd_buffer, pool);
1453
1454 switch (pool->type) {
1455 case VK_QUERY_TYPE_OCCLUSION:
1456 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1457 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1458 unsigned query = firstQuery + i;
1459 uint64_t src_va = va + query * pool->stride + pool->stride - 4;
1460
1461 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1462
1463 /* Waits on the upper word of the last DB entry */
1464 radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL,
1465 src_va, 0x80000000, 0xffffffff);
1466 }
1467 }
1468 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
1469 pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1470 dst_buffer->offset + dstOffset,
1471 pool->stride, stride,
1472 queryCount, flags, 0, 0);
1473 break;
1474 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1475 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1476 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1477 unsigned query = firstQuery + i;
1478
1479 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1480
1481 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1482
1483 /* This waits on the ME. All copies below are done on the ME */
1484 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL,
1485 avail_va, 1, 0xffffffff);
1486 }
1487 }
1488 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
1489 pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1490 dst_buffer->offset + dstOffset,
1491 pool->stride, stride, queryCount, flags,
1492 pool->pipeline_stats_mask,
1493 pool->availability_offset + 4 * firstQuery);
1494 break;
1495 case VK_QUERY_TYPE_TIMESTAMP:
1496 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1497 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1498 unsigned query = firstQuery + i;
1499 uint64_t local_src_va = va + query * pool->stride;
1500
1501 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1502
1503 /* Wait on the high 32 bits of the timestamp in
1504 * case the low part is 0xffffffff.
1505 */
1506 radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL,
1507 local_src_va + 4,
1508 TIMESTAMP_NOT_READY >> 32,
1509 0xffffffff);
1510 }
1511 }
1512
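/* The timestamp resolve shader copies the raw 64-bit values; a slot that
 * still contains TIMESTAMP_NOT_READY is reported as unavailable.
 */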
1513 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline,
1514 pool->bo, dst_buffer->bo,
1515 firstQuery * pool->stride,
1516 dst_buffer->offset + dstOffset,
1517 pool->stride, stride,
1518 queryCount, flags, 0, 0);
1519 break;
1520 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1521 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1522 for(unsigned i = 0; i < queryCount; i++) {
1523 unsigned query = firstQuery + i;
1524 uint64_t src_va = va + query * pool->stride;
1525
1526 radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1527
1528 /* Wait on the upper word of all results. */
1529 for (unsigned j = 0; j < 4; j++, src_va += 8) {
1530 radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL,
1531 src_va + 4, 0x80000000,
1532 0xffffffff);
1533 }
1534 }
1535 }
1536
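/* Each transform feedback slot holds begin and end samples of the
 * primitives-written and primitives-needed counters; the resolve shader
 * reports the end minus begin deltas.
 */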
1537 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline,
1538 pool->bo, dst_buffer->bo,
1539 firstQuery * pool->stride,
1540 dst_buffer->offset + dstOffset,
1541 pool->stride, stride,
1542 queryCount, flags, 0, 0);
1543 break;
1544 default:
1545 unreachable("trying to get results of unhandled query type");
1546 }
1547
1548 }
1549
1550 void radv_CmdResetQueryPool(
1551 VkCommandBuffer commandBuffer,
1552 VkQueryPool queryPool,
1553 uint32_t firstQuery,
1554 uint32_t queryCount)
1555 {
1556 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1557 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1558 uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP
1559 ? (uint32_t)TIMESTAMP_NOT_READY : 0;
1560 uint32_t flush_bits = 0;
1561
1562 /* Make sure to sync all previous work if the given command buffer has
1563 * pending active queries. Otherwise the GPU might write query data
1564 * after the reset operation.
1565 */
1566 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1567
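/* radv_fill_buffer() clears the slots either with CP DMA or with a
 * compute shader; it only returns non-zero flush bits for the compute
 * path.
 */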
1568 flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
1569 firstQuery * pool->stride,
1570 queryCount * pool->stride, value);
1571
1572 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1573 flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
1574 pool->availability_offset + firstQuery * 4,
1575 queryCount * 4, 0);
1576 }
1577
1578 if (flush_bits) {
1579 /* Only need to flush caches for the compute shader path. */
1580 cmd_buffer->pending_reset_query = true;
1581 cmd_buffer->state.flush_bits |= flush_bits;
1582 }
1583 }
1584
1585 void radv_ResetQueryPool(
1586 VkDevice _device,
1587 VkQueryPool queryPool,
1588 uint32_t firstQuery,
1589 uint32_t queryCount)
1590 {
1591 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1592
1593 uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP
1594 ? (uint32_t)TIMESTAMP_NOT_READY : 0;
1595 uint32_t *data = (uint32_t*)(pool->ptr + firstQuery * pool->stride);
1596 uint32_t *data_end = (uint32_t*)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1597
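/* Host-side reset: write the not-ready pattern (or zero) directly
 * through the pool's persistent CPU mapping.
 */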
1598 for(uint32_t *p = data; p != data_end; ++p)
1599 *p = value;
1600
1601 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1602 memset(pool->ptr + pool->availability_offset + firstQuery * 4,
1603 0, queryCount * 4);
1604 }
1605 }
1606
1607 static unsigned event_type_for_stream(unsigned stream)
1608 {
1609 switch (stream) {
1610 default:
1611 case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
1612 case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
1613 case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
1614 case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
1615 }
1616 }
1617
1618 static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
1619 struct radv_query_pool *pool,
1620 uint64_t va,
1621 VkQueryType query_type,
1622 VkQueryControlFlags flags,
1623 uint32_t index)
1624 {
1625 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1626 switch (query_type) {
1627 case VK_QUERY_TYPE_OCCLUSION:
1628 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1629
1630 ++cmd_buffer->state.active_occlusion_queries;
1631 if (cmd_buffer->state.active_occlusion_queries == 1) {
1632 if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
1633 /* This is the first occlusion query; enable
1634 * the hint if the precise bit is set.
1635 */
1636 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1637 }
1638
1639 radv_set_db_count_control(cmd_buffer);
1640 } else {
1641 if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
1642 !cmd_buffer->state.perfect_occlusion_queries_enabled) {
1643 /* This is not the first query, but this one
1644 * needs precise results, so DB_COUNT_CONTROL
1645 * has to be updated accordingly.
1646 */
1647 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1648
1649 radv_set_db_count_control(cmd_buffer);
1650 }
1651 }
1652
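/* ZPASS_DONE makes each enabled RB dump its Z-pass counter; the begin
 * samples land in the first 64-bit word of every per-RB pair at va.
 */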
1653 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1654 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1655 radeon_emit(cs, va);
1656 radeon_emit(cs, va >> 32);
1657 break;
1658 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1659 radeon_check_space(cmd_buffer->device->ws, cs, 4);
1660
1661 ++cmd_buffer->state.active_pipeline_queries;
1662 if (cmd_buffer->state.active_pipeline_queries == 1) {
1663 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1664 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
1665 }
1666
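/* SAMPLE_PIPELINESTAT snapshots the 11 hardware pipeline counters into
 * the begin block at va.
 */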
1667 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1668 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1669 radeon_emit(cs, va);
1670 radeon_emit(cs, va >> 32);
1671
1672 if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
1673 int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
1674
1675 /* Make sure GDS is idle before copying the value. */
1676 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1677 RADV_CMD_FLAG_INV_L2;
1678 si_emit_cache_flush(cmd_buffer);
1679
1680 va += 8 * idx;
1681
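/* Copy the GDS counter (used to count GS-emitted primitives when NGG is
 * active) into the GEOMETRY_SHADER_PRIMITIVES slot of the begin block.
 */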
1682 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1683 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) |
1684 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
1685 COPY_DATA_WR_CONFIRM);
1686 radeon_emit(cs, 0);
1687 radeon_emit(cs, 0);
1688 radeon_emit(cs, va);
1689 radeon_emit(cs, va >> 32);
1690
1691 /* Record that the command buffer needs GDS. */
1692 cmd_buffer->gds_needed = true;
1693
1694 cmd_buffer->state.active_pipeline_gds_queries++;
1695 }
1696 break;
1697 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1698 radeon_check_space(cmd_buffer->device->ws, cs, 4);
1699
1700 assert(index < MAX_SO_STREAMS);
1701
1702 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1703 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
1704 radeon_emit(cs, va);
1705 radeon_emit(cs, va >> 32);
1706 break;
1707 default:
1708 unreachable("beginning unhandled query type");
1709 }
1710
1711 }
1712
1713 static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
1714 struct radv_query_pool *pool,
1715 uint64_t va, uint64_t avail_va,
1716 VkQueryType query_type, uint32_t index)
1717 {
1718 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1719 switch (query_type) {
1720 case VK_QUERY_TYPE_OCCLUSION:
1721 radeon_check_space(cmd_buffer->device->ws, cs, 14);
1722
1723 cmd_buffer->state.active_occlusion_queries--;
1724 if (cmd_buffer->state.active_occlusion_queries == 0) {
1725 radv_set_db_count_control(cmd_buffer);
1726
1727 /* Reset the perfect occlusion queries hint now that no
1728 * queries are active.
1729 */
1730 cmd_buffer->state.perfect_occlusion_queries_enabled = false;
1731 }
1732
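/* The end ZPASS_DONE samples go to the second 64-bit word of each
 * per-RB pair (va + 8).
 */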
1733 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1734 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1735 radeon_emit(cs, va + 8);
1736 radeon_emit(cs, (va + 8) >> 32);
1737
1738 break;
1739 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1740 radeon_check_space(cmd_buffer->device->ws, cs, 16);
1741
1742 cmd_buffer->state.active_pipeline_queries--;
1743 if (cmd_buffer->state.active_pipeline_queries == 0) {
1744 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
1745 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1746 }
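/* The end snapshot is written right after the 11-counter begin block. */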
1747 va += pipelinestat_block_size;
1748
1749 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1750 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1751 radeon_emit(cs, va);
1752 radeon_emit(cs, va >> 32);
1753
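/* Write 1 to the availability dword from the bottom of the pipe, once
 * the counter snapshot above has landed in memory.
 */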
1754 si_cs_emit_write_event_eop(cs,
1755 cmd_buffer->device->physical_device->rad_info.chip_class,
1756 radv_cmd_buffer_uses_mec(cmd_buffer),
1757 V_028A90_BOTTOM_OF_PIPE_TS, 0,
1758 EOP_DST_SEL_MEM,
1759 EOP_DATA_SEL_VALUE_32BIT,
1760 avail_va, 1,
1761 cmd_buffer->gfx9_eop_bug_va);
1762
1763 if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
1764 int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
1765
1766 /* Make sure GDS is idle before copying the value. */
1767 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1768 RADV_CMD_FLAG_INV_L2;
1769 si_emit_cache_flush(cmd_buffer);
1770
1771 va += 8 * idx;
1772
1773 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1774 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) |
1775 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
1776 COPY_DATA_WR_CONFIRM);
1777 radeon_emit(cs, 0);
1778 radeon_emit(cs, 0);
1779 radeon_emit(cs, va);
1780 radeon_emit(cs, va >> 32);
1781
1782 cmd_buffer->state.active_pipeline_gds_queries--;
1783 }
1784 break;
1785 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1786 radeon_check_space(cmd_buffer->device->ws, cs, 4);
1787
1788 assert(index < MAX_SO_STREAMS);
1789
1790 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1791 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
1792 radeon_emit(cs, (va + 16));
1793 radeon_emit(cs, (va + 16) >> 32);
1794 break;
1795 default:
1796 unreachable("ending unhandled query type");
1797 }
1798
1799 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1800 RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
1801 RADV_CMD_FLAG_INV_L2 |
1802 RADV_CMD_FLAG_INV_VCACHE;
1803 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1804 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
1805 RADV_CMD_FLAG_FLUSH_AND_INV_DB;
1806 }
1807 }
1808
1809 void radv_CmdBeginQueryIndexedEXT(
1810 VkCommandBuffer commandBuffer,
1811 VkQueryPool queryPool,
1812 uint32_t query,
1813 VkQueryControlFlags flags,
1814 uint32_t index)
1815 {
1816 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1817 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1818 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1819 uint64_t va = radv_buffer_get_va(pool->bo);
1820
1821 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1822
1823 emit_query_flush(cmd_buffer, pool);
1824
1825 va += pool->stride * query;
1826
1827 emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
1828 }
1829
1830 void radv_CmdBeginQuery(
1831 VkCommandBuffer commandBuffer,
1832 VkQueryPool queryPool,
1833 uint32_t query,
1834 VkQueryControlFlags flags)
1835 {
1836 radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
1837 }
1838
1839 void radv_CmdEndQueryIndexedEXT(
1840 VkCommandBuffer commandBuffer,
1841 VkQueryPool queryPool,
1842 uint32_t query,
1843 uint32_t index)
1844 {
1845 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1846 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1847 uint64_t va = radv_buffer_get_va(pool->bo);
1848 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1849 va += pool->stride * query;
1850
1851 /* We do not need to add the pool BO to the list because the query must
1852 * currently be active, which means the BO is already in the list.
1853 */
1854 emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
1855
1856 /*
1857 * For multiview we have to emit a query for each bit in the mask;
1858 * however, the first query we emit will get the totals for all the
1859 * operations, so we don't want to get a real value in the other
1860 * queries. This emits a fake begin/end sequence so the waiting
1861 * code gets a completed query value and doesn't hang, but the
1862 * query returns 0.
1863 */
1864 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
1865 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1866
1867
1868 for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
1869 va += pool->stride;
1870 avail_va += 4;
1871 emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
1872 emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
1873 }
1874 }
1875 }
1876
1877 void radv_CmdEndQuery(
1878 VkCommandBuffer commandBuffer,
1879 VkQueryPool queryPool,
1880 uint32_t query)
1881 {
1882 radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
1883 }
1884
1885 void radv_CmdWriteTimestamp(
1886 VkCommandBuffer commandBuffer,
1887 VkPipelineStageFlagBits pipelineStage,
1888 VkQueryPool queryPool,
1889 uint32_t query)
1890 {
1891 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1892 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1893 bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
1894 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1895 uint64_t va = radv_buffer_get_va(pool->bo);
1896 uint64_t query_va = va + pool->stride * query;
1897
1898 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1899
1900 emit_query_flush(cmd_buffer, pool);
1901
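/* With multiview enabled, one timestamp is written per view bit, into
 * consecutive query slots.
 */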
1902 int num_queries = 1;
1903 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
1904 num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);
1905
1906 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
1907
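/* TOP_OF_PIPE copies the current GPU clock immediately with COPY_DATA;
 * any other stage waits for bottom-of-pipe completion via an EOP event
 * that writes the 64-bit timestamp.
 */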
1908 for (unsigned i = 0; i < num_queries; i++) {
1909 switch(pipelineStage) {
1910 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1911 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1912 radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
1913 COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
1914 COPY_DATA_DST_SEL(V_370_MEM));
1915 radeon_emit(cs, 0);
1916 radeon_emit(cs, 0);
1917 radeon_emit(cs, query_va);
1918 radeon_emit(cs, query_va >> 32);
1919 break;
1920 default:
1921 si_cs_emit_write_event_eop(cs,
1922 cmd_buffer->device->physical_device->rad_info.chip_class,
1923 mec,
1924 V_028A90_BOTTOM_OF_PIPE_TS, 0,
1925 EOP_DST_SEL_MEM,
1926 EOP_DATA_SEL_TIMESTAMP,
1927 query_va, 0,
1928 cmd_buffer->gfx9_eop_bug_va);
1929 break;
1930 }
1931 query_va += pool->stride;
1932 }
1933
1934 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1935 RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
1936 RADV_CMD_FLAG_INV_L2 |
1937 RADV_CMD_FLAG_INV_VCACHE;
1938 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1939 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
1940 RADV_CMD_FLAG_FLUSH_AND_INV_DB;
1941 }
1942
1943 assert(cmd_buffer->cs->cdw <= cdw_max);
1944 }
1945