1 /*
2 * Copyright © 2020 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "v3dv_private.h"
25
26 #include "util/timespec.h"
27 #include "compiler/nir/nir_builder.h"
28
29 static void
kperfmon_create(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t query)30 kperfmon_create(struct v3dv_device *device,
31 struct v3dv_query_pool *pool,
32 uint32_t query)
33 {
34 for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
35 assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
36
37 struct drm_v3d_perfmon_create req = {
38 .ncounters = MIN2(pool->perfmon.ncounters -
39 i * DRM_V3D_MAX_PERF_COUNTERS,
40 DRM_V3D_MAX_PERF_COUNTERS),
41 };
42 memcpy(req.counters,
43 &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
44 req.ncounters);
45
46 int ret = v3dv_ioctl(device->pdevice->render_fd,
47 DRM_IOCTL_V3D_PERFMON_CREATE,
48 &req);
49 if (ret)
50 fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret));
51
52 pool->queries[query].perf.kperfmon_ids[i] = req.id;
53 }
54 }
55
56 static void
kperfmon_destroy(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t query)57 kperfmon_destroy(struct v3dv_device *device,
58 struct v3dv_query_pool *pool,
59 uint32_t query)
60 {
61 /* Skip destroying if never created */
62 if (!pool->queries[query].perf.kperfmon_ids[0])
63 return;
64
65 for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
66 struct drm_v3d_perfmon_destroy req = {
67 .id = pool->queries[query].perf.kperfmon_ids[i]
68 };
69
70 int ret = v3dv_ioctl(device->pdevice->render_fd,
71 DRM_IOCTL_V3D_PERFMON_DESTROY,
72 &req);
73
74 if (ret) {
75 fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
76 req.id, strerror(ret));
77 }
78 }
79 }
80
81 /**
82 * Creates a VkBuffer (and VkDeviceMemory) to access a BO.
83 */
84 static VkResult
create_vk_storage_buffer(struct v3dv_device * device,struct v3dv_bo * bo,VkBuffer * vk_buf,VkDeviceMemory * vk_mem)85 create_vk_storage_buffer(struct v3dv_device *device,
86 struct v3dv_bo *bo,
87 VkBuffer *vk_buf,
88 VkDeviceMemory *vk_mem)
89 {
90 VkDevice vk_device = v3dv_device_to_handle(device);
91
92 VkBufferCreateInfo buf_info = {
93 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
94 .size = bo->size,
95 .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
96 };
97 VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf);
98 if (result != VK_SUCCESS)
99 return result;
100
101 struct v3dv_device_memory *mem =
102 vk_object_zalloc(&device->vk, NULL, sizeof(*mem),
103 VK_OBJECT_TYPE_DEVICE_MEMORY);
104 if (!mem)
105 return VK_ERROR_OUT_OF_HOST_MEMORY;
106
107 mem->bo = bo;
108 mem->type = &device->pdevice->memory.memoryTypes[0];
109
110 *vk_mem = v3dv_device_memory_to_handle(mem);
111 VkBindBufferMemoryInfo bind_info = {
112 .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
113 .buffer = *vk_buf,
114 .memory = *vk_mem,
115 .memoryOffset = 0,
116 };
117 v3dv_BindBufferMemory2(vk_device, 1, &bind_info);
118
119 return VK_SUCCESS;
120 }
121
122 static void
destroy_vk_storage_buffer(struct v3dv_device * device,VkBuffer * vk_buf,VkDeviceMemory * vk_mem)123 destroy_vk_storage_buffer(struct v3dv_device *device,
124 VkBuffer *vk_buf,
125 VkDeviceMemory *vk_mem)
126 {
127 if (*vk_mem) {
128 vk_object_free(&device->vk, NULL, v3dv_device_memory_from_handle(*vk_mem));
129 *vk_mem = VK_NULL_HANDLE;
130 }
131
132 v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL);
133 *vk_buf = VK_NULL_HANDLE;
134 }
135
136 /**
137 * Allocates descriptor sets to access query pool BO (availability and
138 * occlusion query results) from Vulkan pipelines.
139 */
140 static VkResult
create_pool_descriptors(struct v3dv_device * device,struct v3dv_query_pool * pool)141 create_pool_descriptors(struct v3dv_device *device,
142 struct v3dv_query_pool *pool)
143 {
144 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
145 VkDevice vk_device = v3dv_device_to_handle(device);
146
147 VkDescriptorPoolSize pool_size = {
148 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
149 .descriptorCount = 1,
150 };
151 VkDescriptorPoolCreateInfo pool_info = {
152 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
153 .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
154 .maxSets = 1,
155 .poolSizeCount = 1,
156 .pPoolSizes = &pool_size,
157 };
158 VkResult result =
159 v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL,
160 &pool->meta.descriptor_pool);
161
162 if (result != VK_SUCCESS)
163 return result;
164
165 VkDescriptorSetAllocateInfo alloc_info = {
166 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
167 .descriptorPool = pool->meta.descriptor_pool,
168 .descriptorSetCount = 1,
169 .pSetLayouts = &device->queries.buf_descriptor_set_layout,
170 };
171 result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info,
172 &pool->meta.descriptor_set);
173 if (result != VK_SUCCESS)
174 return result;
175
176 VkDescriptorBufferInfo desc_buf_info = {
177 .buffer = pool->meta.buf,
178 .offset = 0,
179 .range = VK_WHOLE_SIZE,
180 };
181
182 VkWriteDescriptorSet write = {
183 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
184 .dstSet = pool->meta.descriptor_set,
185 .dstBinding = 0,
186 .dstArrayElement = 0,
187 .descriptorCount = 1,
188 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
189 .pBufferInfo = &desc_buf_info,
190 };
191 v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
192
193 return VK_SUCCESS;
194 }
195
196 static void
destroy_pool_descriptors(struct v3dv_device * device,struct v3dv_query_pool * pool)197 destroy_pool_descriptors(struct v3dv_device *device,
198 struct v3dv_query_pool *pool)
199 {
200 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
201
202 v3dv_FreeDescriptorSets(v3dv_device_to_handle(device),
203 pool->meta.descriptor_pool,
204 1, &pool->meta.descriptor_set);
205 pool->meta.descriptor_set = VK_NULL_HANDLE;
206
207 v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device),
208 pool->meta.descriptor_pool, NULL);
209 pool->meta.descriptor_pool = VK_NULL_HANDLE;
210 }
211
212 static VkResult
pool_create_meta_resources(struct v3dv_device * device,struct v3dv_query_pool * pool)213 pool_create_meta_resources(struct v3dv_device *device,
214 struct v3dv_query_pool *pool)
215 {
216 VkResult result;
217
218 if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
219 return VK_SUCCESS;
220
221 result = create_vk_storage_buffer(device, pool->occlusion.bo,
222 &pool->meta.buf, &pool->meta.mem);
223 if (result != VK_SUCCESS)
224 return result;
225
226 result = create_pool_descriptors(device, pool);
227 if (result != VK_SUCCESS)
228 return result;
229
230 return VK_SUCCESS;
231 }
232
233 static void
pool_destroy_meta_resources(struct v3dv_device * device,struct v3dv_query_pool * pool)234 pool_destroy_meta_resources(struct v3dv_device *device,
235 struct v3dv_query_pool *pool)
236 {
237 if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
238 return;
239
240 destroy_pool_descriptors(device, pool);
241 destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem);
242 }
243
244 VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateQueryPool(VkDevice _device,const VkQueryPoolCreateInfo * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkQueryPool * pQueryPool)245 v3dv_CreateQueryPool(VkDevice _device,
246 const VkQueryPoolCreateInfo *pCreateInfo,
247 const VkAllocationCallbacks *pAllocator,
248 VkQueryPool *pQueryPool)
249 {
250 V3DV_FROM_HANDLE(v3dv_device, device, _device);
251
252 assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
253 pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
254 pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
255 assert(pCreateInfo->queryCount > 0);
256
257 struct v3dv_query_pool *pool =
258 vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
259 VK_OBJECT_TYPE_QUERY_POOL);
260 if (pool == NULL)
261 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
262
263 pool->query_type = pCreateInfo->queryType;
264 pool->query_count = pCreateInfo->queryCount;
265
266 uint32_t query_idx = 0;
267 VkResult result;
268
269 const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
270 pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
271 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
272 if (pool->queries == NULL) {
273 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
274 goto fail;
275 }
276
277 switch (pool->query_type) {
278 case VK_QUERY_TYPE_OCCLUSION: {
279 /* The hardware allows us to setup groups of 16 queries in consecutive
280 * 4-byte addresses, requiring only that each group of 16 queries is
281 * aligned to a 1024 byte boundary.
282 */
283 const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
284 uint32_t bo_size = query_groups * 1024;
285 /* After the counters we store avalability data, 1 byte/query */
286 pool->occlusion.avail_offset = bo_size;
287 bo_size += pool->query_count;
288 pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true);
289 if (!pool->occlusion.bo) {
290 result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
291 goto fail;
292 }
293 if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) {
294 result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
295 goto fail;
296 }
297 break;
298 }
299 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
300 const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
301 vk_find_struct_const(pCreateInfo->pNext,
302 QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
303
304 assert(pq_info);
305
306 pool->perfmon.ncounters = pq_info->counterIndexCount;
307 for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
308 pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
309
310 pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
311 DRM_V3D_MAX_PERF_COUNTERS);
312
313 assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
314 break;
315 }
316 case VK_QUERY_TYPE_TIMESTAMP: {
317 /* 8 bytes per query used for the timestamp value. We have all
318 * timestamps tightly packed first in the buffer.
319 */
320 const uint32_t bo_size = pool->query_count * 8;
321 pool->timestamp.bo = v3dv_bo_alloc(device, bo_size, "query:t", true);
322 if (!pool->timestamp.bo) {
323 result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
324 goto fail;
325 }
326 if (!v3dv_bo_map(device, pool->timestamp.bo, bo_size)) {
327 result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
328 goto fail;
329 }
330 break;
331 }
332 default:
333 unreachable("Unsupported query type");
334 }
335
336 /* Initialize queries in the pool */
337 for (; query_idx < pool->query_count; query_idx++) {
338 pool->queries[query_idx].maybe_available = false;
339 switch (pool->query_type) {
340 case VK_QUERY_TYPE_OCCLUSION: {
341 const uint32_t query_group = query_idx / 16;
342 const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
343 pool->queries[query_idx].occlusion.offset = query_offset;
344 break;
345 }
346 case VK_QUERY_TYPE_TIMESTAMP:
347 pool->queries[query_idx].timestamp.offset = query_idx * 8;
348 result = vk_sync_create(&device->vk,
349 &device->pdevice->drm_syncobj_type, 0, 0,
350 &pool->queries[query_idx].timestamp.sync);
351 if (result != VK_SUCCESS)
352 goto fail;
353 break;
354 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
355 result = vk_sync_create(&device->vk,
356 &device->pdevice->drm_syncobj_type, 0, 0,
357 &pool->queries[query_idx].perf.last_job_sync);
358 if (result != VK_SUCCESS)
359 goto fail;
360
361 kperfmon_create(device, pool, query_idx);
362 break;
363 }
364 default:
365 unreachable("Unsupported query type");
366 }
367 }
368
369 /* Create meta resources */
370 result = pool_create_meta_resources(device, pool);
371 if (result != VK_SUCCESS)
372 goto fail;
373
374 *pQueryPool = v3dv_query_pool_to_handle(pool);
375
376 return VK_SUCCESS;
377
378 fail:
379 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
380 for (uint32_t j = 0; j < query_idx; j++)
381 vk_sync_destroy(&device->vk, pool->queries[j].timestamp.sync);
382 }
383
384 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
385 for (uint32_t j = 0; j < query_idx; j++)
386 vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
387 }
388
389 if (pool->occlusion.bo)
390 v3dv_bo_free(device, pool->occlusion.bo);
391 if (pool->timestamp.bo)
392 v3dv_bo_free(device, pool->timestamp.bo);
393 if (pool->queries)
394 vk_free2(&device->vk.alloc, pAllocator, pool->queries);
395 pool_destroy_meta_resources(device, pool);
396 vk_object_free(&device->vk, pAllocator, pool);
397
398 return result;
399 }
400
401 VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyQueryPool(VkDevice _device,VkQueryPool queryPool,const VkAllocationCallbacks * pAllocator)402 v3dv_DestroyQueryPool(VkDevice _device,
403 VkQueryPool queryPool,
404 const VkAllocationCallbacks *pAllocator)
405 {
406 V3DV_FROM_HANDLE(v3dv_device, device, _device);
407 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
408
409 if (!pool)
410 return;
411
412 if (pool->occlusion.bo)
413 v3dv_bo_free(device, pool->occlusion.bo);
414
415 if (pool->timestamp.bo)
416 v3dv_bo_free(device, pool->timestamp.bo);
417
418 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
419 for (uint32_t i = 0; i < pool->query_count; i++)
420 vk_sync_destroy(&device->vk, pool->queries[i].timestamp.sync);
421 }
422
423 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
424 for (uint32_t i = 0; i < pool->query_count; i++) {
425 kperfmon_destroy(device, pool, i);
426 vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
427 }
428 }
429
430 if (pool->queries)
431 vk_free2(&device->vk.alloc, pAllocator, pool->queries);
432
433 pool_destroy_meta_resources(device, pool);
434
435 vk_object_free(&device->vk, pAllocator, pool);
436 }
437
438 static void
write_to_buffer(void * dst,uint32_t idx,bool do_64bit,uint64_t value)439 write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
440 {
441 if (do_64bit) {
442 uint64_t *dst64 = (uint64_t *) dst;
443 dst64[idx] = value;
444 } else {
445 uint32_t *dst32 = (uint32_t *) dst;
446 dst32[idx] = (uint32_t) value;
447 }
448 }
449
450 static VkResult
query_wait_available(struct v3dv_device * device,struct v3dv_query_pool * pool,struct v3dv_query * q,uint32_t query_idx)451 query_wait_available(struct v3dv_device *device,
452 struct v3dv_query_pool *pool,
453 struct v3dv_query *q,
454 uint32_t query_idx)
455 {
456 /* For occlusion queries we prefer to poll the availability BO in a loop
457 * to waiting on the query results BO, because the latter would
458 * make us wait for any job running queries from the pool, even if those
459 * queries do not involve the one we want to wait on.
460 */
461 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
462 uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
463 pool->occlusion.avail_offset + query_idx;
464 while (*q_addr == 0)
465 usleep(250);
466 return VK_SUCCESS;
467 }
468
469 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
470 if (vk_sync_wait(&device->vk, q->timestamp.sync,
471 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
472 return vk_device_set_lost(&device->vk, "Query job wait failed");
473 }
474 return VK_SUCCESS;
475 }
476
477 assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
478
479 /* For performance queries we need to wait for the queue to signal that
480 * the query has been submitted for execution before anything else.
481 */
482 VkResult result = VK_SUCCESS;
483 if (!q->maybe_available) {
484 struct timespec timeout;
485 timespec_get(&timeout, TIME_UTC);
486 timespec_add_msec(&timeout, &timeout, 2000);
487
488 mtx_lock(&device->query_mutex);
489 while (!q->maybe_available) {
490 if (vk_device_is_lost(&device->vk)) {
491 result = VK_ERROR_DEVICE_LOST;
492 break;
493 }
494
495 int ret = cnd_timedwait(&device->query_ended,
496 &device->query_mutex,
497 &timeout);
498 if (ret != thrd_success) {
499 mtx_unlock(&device->query_mutex);
500 result = vk_device_set_lost(&device->vk, "Query wait failed");
501 break;
502 }
503 }
504 mtx_unlock(&device->query_mutex);
505
506 if (result != VK_SUCCESS)
507 return result;
508
509 /* For performance queries, we also need to wait for the relevant syncobj
510 * to be signaled to ensure completion of the GPU work.
511 */
512 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
513 vk_sync_wait(&device->vk, q->perf.last_job_sync,
514 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
515 return vk_device_set_lost(&device->vk, "Query job wait failed");
516 }
517 }
518
519 return result;
520 }
521
522 static VkResult
query_check_available(struct v3dv_device * device,struct v3dv_query_pool * pool,struct v3dv_query * q,uint32_t query_idx)523 query_check_available(struct v3dv_device *device,
524 struct v3dv_query_pool *pool,
525 struct v3dv_query *q,
526 uint32_t query_idx)
527 {
528 /* For occlusion we check the availability BO */
529 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
530 const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
531 pool->occlusion.avail_offset + query_idx;
532 return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY;
533 }
534
535 /* For timestamp queries, we need to check if the relevant job
536 * has completed.
537 */
538 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
539 if (vk_sync_wait(&device->vk, q->timestamp.sync,
540 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
541 return VK_NOT_READY;
542 }
543 return VK_SUCCESS;
544 }
545
546 /* For other queries we need to check if the queue has submitted the query
547 * for execution at all.
548 */
549 assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
550 if (!q->maybe_available)
551 return VK_NOT_READY;
552
553 /* For performance queries, we also need to check if the relevant GPU job
554 * has completed.
555 */
556 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
557 vk_sync_wait(&device->vk, q->perf.last_job_sync,
558 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
559 return VK_NOT_READY;
560 }
561
562 return VK_SUCCESS;
563 }
564
565 static VkResult
query_is_available(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t query,bool do_wait,bool * available)566 query_is_available(struct v3dv_device *device,
567 struct v3dv_query_pool *pool,
568 uint32_t query,
569 bool do_wait,
570 bool *available)
571 {
572 struct v3dv_query *q = &pool->queries[query];
573
574 if (do_wait) {
575 VkResult result = query_wait_available(device, pool, q, query);
576 if (result != VK_SUCCESS) {
577 *available = false;
578 return result;
579 }
580
581 *available = true;
582 } else {
583 VkResult result = query_check_available(device, pool, q, query);
584 assert(result == VK_SUCCESS || result == VK_NOT_READY);
585 *available = (result == VK_SUCCESS);
586 }
587
588 return VK_SUCCESS;
589 }
590
591 static VkResult
write_occlusion_query_result(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t query,bool do_64bit,void * data,uint32_t slot)592 write_occlusion_query_result(struct v3dv_device *device,
593 struct v3dv_query_pool *pool,
594 uint32_t query,
595 bool do_64bit,
596 void *data,
597 uint32_t slot)
598 {
599 assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
600
601 if (vk_device_is_lost(&device->vk))
602 return VK_ERROR_DEVICE_LOST;
603
604 struct v3dv_query *q = &pool->queries[query];
605 assert(pool->occlusion.bo && pool->occlusion.bo->map);
606
607 const uint8_t *query_addr =
608 ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
609 write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
610 return VK_SUCCESS;
611 }
612
613 static VkResult
write_timestamp_query_result(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t query,bool do_64bit,void * data,uint32_t slot)614 write_timestamp_query_result(struct v3dv_device *device,
615 struct v3dv_query_pool *pool,
616 uint32_t query,
617 bool do_64bit,
618 void *data,
619 uint32_t slot)
620 {
621 assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
622
623 struct v3dv_query *q = &pool->queries[query];
624
625 const uint8_t *query_addr =
626 ((uint8_t *) pool->timestamp.bo->map) + q->timestamp.offset;
627
628 write_to_buffer(data, slot, do_64bit, *((uint64_t *)query_addr));
629 return VK_SUCCESS;
630 }
631
632 static VkResult
write_performance_query_result(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t query,bool do_64bit,void * data,uint32_t slot)633 write_performance_query_result(struct v3dv_device *device,
634 struct v3dv_query_pool *pool,
635 uint32_t query,
636 bool do_64bit,
637 void *data,
638 uint32_t slot)
639 {
640 assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
641
642 struct v3dv_query *q = &pool->queries[query];
643 uint64_t counter_values[V3D_MAX_PERFCNT];
644
645 for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
646 struct drm_v3d_perfmon_get_values req = {
647 .id = q->perf.kperfmon_ids[i],
648 .values_ptr = (uintptr_t)(&counter_values[i *
649 DRM_V3D_MAX_PERF_COUNTERS])
650 };
651
652 int ret = v3dv_ioctl(device->pdevice->render_fd,
653 DRM_IOCTL_V3D_PERFMON_GET_VALUES,
654 &req);
655
656 if (ret) {
657 fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
658 return vk_error(device, VK_ERROR_DEVICE_LOST);
659 }
660 }
661
662 for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
663 write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
664
665 return VK_SUCCESS;
666 }
667
668 static VkResult
write_query_result(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t query,bool do_64bit,void * data,uint32_t slot)669 write_query_result(struct v3dv_device *device,
670 struct v3dv_query_pool *pool,
671 uint32_t query,
672 bool do_64bit,
673 void *data,
674 uint32_t slot)
675 {
676 switch (pool->query_type) {
677 case VK_QUERY_TYPE_OCCLUSION:
678 return write_occlusion_query_result(device, pool, query, do_64bit,
679 data, slot);
680 case VK_QUERY_TYPE_TIMESTAMP:
681 return write_timestamp_query_result(device, pool, query, do_64bit,
682 data, slot);
683 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
684 return write_performance_query_result(device, pool, query, do_64bit,
685 data, slot);
686 default:
687 unreachable("Unsupported query type");
688 }
689 }
690
691 static uint32_t
get_query_result_count(struct v3dv_query_pool * pool)692 get_query_result_count(struct v3dv_query_pool *pool)
693 {
694 switch (pool->query_type) {
695 case VK_QUERY_TYPE_OCCLUSION:
696 case VK_QUERY_TYPE_TIMESTAMP:
697 return 1;
698 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
699 return pool->perfmon.ncounters;
700 default:
701 unreachable("Unsupported query type");
702 }
703 }
704
705 VkResult
v3dv_get_query_pool_results_cpu(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t first,uint32_t count,void * data,VkDeviceSize stride,VkQueryResultFlags flags)706 v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
707 struct v3dv_query_pool *pool,
708 uint32_t first,
709 uint32_t count,
710 void *data,
711 VkDeviceSize stride,
712 VkQueryResultFlags flags)
713 {
714 assert(first < pool->query_count);
715 assert(first + count <= pool->query_count);
716 assert(data);
717
718 const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
719 pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
720 const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
721 const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
722
723 uint32_t result_count = get_query_result_count(pool);
724
725 VkResult result = VK_SUCCESS;
726 for (uint32_t i = first; i < first + count; i++) {
727 bool available = false;
728 VkResult query_result =
729 query_is_available(device, pool, i, do_wait, &available);
730 if (query_result == VK_ERROR_DEVICE_LOST)
731 result = VK_ERROR_DEVICE_LOST;
732
733 /**
734 * From the Vulkan 1.0 spec:
735 *
736 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
737 * both not set then no result values are written to pData for queries
738 * that are in the unavailable state at the time of the call, and
739 * vkGetQueryPoolResults returns VK_NOT_READY. However, availability
740 * state is still written to pData for those queries if
741 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
742 */
743 uint32_t slot = 0;
744
745 const bool write_result = available || do_partial;
746 if (write_result)
747 write_query_result(device, pool, i, do_64bit, data, slot);
748 slot += result_count;
749
750 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
751 write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
752
753 if (!write_result && result != VK_ERROR_DEVICE_LOST)
754 result = VK_NOT_READY;
755
756 data += stride;
757 }
758
759 return result;
760 }
761
762 VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetQueryPoolResults(VkDevice _device,VkQueryPool queryPool,uint32_t firstQuery,uint32_t queryCount,size_t dataSize,void * pData,VkDeviceSize stride,VkQueryResultFlags flags)763 v3dv_GetQueryPoolResults(VkDevice _device,
764 VkQueryPool queryPool,
765 uint32_t firstQuery,
766 uint32_t queryCount,
767 size_t dataSize,
768 void *pData,
769 VkDeviceSize stride,
770 VkQueryResultFlags flags)
771 {
772 V3DV_FROM_HANDLE(v3dv_device, device, _device);
773 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
774
775 return v3dv_get_query_pool_results_cpu(device, pool, firstQuery, queryCount,
776 pData, stride, flags);
777 }
778
779 /* Emits a series of vkCmdDispatchBase calls to execute all the workgroups
780 * required to handle a number of queries considering per-dispatch limits.
781 */
782 static void
cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer * cmd_buffer,uint32_t query_count)783 cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer,
784 uint32_t query_count)
785 {
786 VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
787
788 uint32_t dispatched = 0;
789 const uint32_t max_batch_size = 65535;
790 while (dispatched < query_count) {
791 uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size);
792 v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1);
793 dispatched += batch_size;
794 }
795 }
796
797 void
v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_query_pool * pool,uint32_t query,uint32_t count,uint8_t availability)798 v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer,
799 struct v3dv_query_pool *pool,
800 uint32_t query, uint32_t count,
801 uint8_t availability)
802 {
803 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
804 pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
805
806 struct v3dv_device *device = cmd_buffer->device;
807 VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
808
809 /* We are about to emit a compute job to set query availability and we need
810 * to ensure this executes after the graphics work using the queries has
811 * completed.
812 */
813 VkMemoryBarrier2 barrier = {
814 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
815 .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
816 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
817 };
818 VkDependencyInfo barrier_info = {
819 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
820 .memoryBarrierCount = 1,
821 .pMemoryBarriers = &barrier,
822 };
823 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
824
825 /* Dispatch queries */
826 v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
827
828 v3dv_CmdBindPipeline(vk_cmd_buffer,
829 VK_PIPELINE_BIND_POINT_COMPUTE,
830 device->queries.avail_pipeline);
831
832 v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
833 VK_PIPELINE_BIND_POINT_COMPUTE,
834 device->queries.avail_pipeline_layout,
835 0, 1, &pool->meta.descriptor_set,
836 0, NULL);
837
838 struct {
839 uint32_t offset;
840 uint32_t query;
841 uint8_t availability;
842 } push_data = { pool->occlusion.avail_offset, query, availability };
843 v3dv_CmdPushConstants(vk_cmd_buffer,
844 device->queries.avail_pipeline_layout,
845 VK_SHADER_STAGE_COMPUTE_BIT,
846 0, sizeof(push_data), &push_data);
847 cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
848
849 v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
850 }
851
852 static void
cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_query_pool * pool,uint32_t query,uint32_t count)853 cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
854 struct v3dv_query_pool *pool,
855 uint32_t query, uint32_t count)
856 {
857 struct v3dv_device *device = cmd_buffer->device;
858 VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
859
860 /* Ensure the GPU is done with the queries in the graphics queue before
861 * we reset in the compute queue.
862 */
863 VkMemoryBarrier2 barrier = {
864 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
865 .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
866 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
867 };
868 VkDependencyInfo barrier_info = {
869 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
870 .memoryBarrierCount = 1,
871 .pMemoryBarriers = &barrier,
872 };
873 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
874
875 /* Emit compute reset */
876 v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
877
878 v3dv_CmdBindPipeline(vk_cmd_buffer,
879 VK_PIPELINE_BIND_POINT_COMPUTE,
880 device->queries.reset_occlusion_pipeline);
881
882 v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
883 VK_PIPELINE_BIND_POINT_COMPUTE,
884 device->queries.reset_occlusion_pipeline_layout,
885 0, 1, &pool->meta.descriptor_set,
886 0, NULL);
887 struct {
888 uint32_t offset;
889 uint32_t query;
890 } push_data = { pool->occlusion.avail_offset, query };
891 v3dv_CmdPushConstants(vk_cmd_buffer,
892 device->queries.reset_occlusion_pipeline_layout,
893 VK_SHADER_STAGE_COMPUTE_BIT,
894 0, sizeof(push_data), &push_data);
895
896 cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
897
898 v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
899
900 /* Ensure future work in the graphics queue using the queries doesn't start
901 * before the reset completed.
902 */
903 barrier = (VkMemoryBarrier2) {
904 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
905 .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
906 .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT,
907 };
908 barrier_info = (VkDependencyInfo) {
909 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
910 .memoryBarrierCount = 1,
911 .pMemoryBarriers = &barrier,
912 };
913 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
914 }
915
916 static void
cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_query_pool * pool,uint32_t first,uint32_t count)917 cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
918 struct v3dv_query_pool *pool,
919 uint32_t first, uint32_t count)
920 {
921 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
922 cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count);
923 }
924
925 static void
cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_query_pool * pool,uint32_t first,uint32_t count)926 cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer,
927 struct v3dv_query_pool *pool,
928 uint32_t first, uint32_t count)
929 {
930 assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION);
931
932 struct v3dv_job *job =
933 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
934 V3DV_JOB_TYPE_CPU_RESET_QUERIES,
935 cmd_buffer, -1);
936 v3dv_return_if_oom(cmd_buffer, NULL);
937 job->cpu.query_reset.pool = pool;
938 job->cpu.query_reset.first = first;
939 job->cpu.query_reset.count = count;
940 list_addtail(&job->list_link, &cmd_buffer->jobs);
941 }
942
943 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,VkQueryPool queryPool,uint32_t firstQuery,uint32_t queryCount)944 v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
945 VkQueryPool queryPool,
946 uint32_t firstQuery,
947 uint32_t queryCount)
948 {
949 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
950 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
951
952 /* Resets can only happen outside a render pass instance so we should not
953 * be in the middle of job recording.
954 */
955 assert(cmd_buffer->state.pass == NULL);
956 assert(cmd_buffer->state.job == NULL);
957
958 assert(firstQuery < pool->query_count);
959 assert(firstQuery + queryCount <= pool->query_count);
960
961 /* We can reset occlusion queries in the GPU, but for other query types
962 * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed
963 * in the queue.
964 */
965 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
966 cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery, queryCount);
967 } else {
968 cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool,
969 firstQuery, queryCount);
970 }
971 }
972
973 /**
974 * Creates a descriptor pool so we can create a descriptors for the destination
975 * buffers of vkCmdCopyQueryResults for queries where this is implemented in
976 * the GPU.
977 */
978 static VkResult
create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer * cmd_buffer)979 create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
980 {
981 /* If this is not the first pool we create one for this command buffer
982 * size it based on the size of the currently exhausted pool.
983 */
984 uint32_t descriptor_count = 32;
985 if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) {
986 struct v3dv_descriptor_pool *exhausted_pool =
987 v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool);
988 descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
989 }
990
991 /* Create the descriptor pool */
992 cmd_buffer->meta.query.dspool = VK_NULL_HANDLE;
993 VkDescriptorPoolSize pool_size = {
994 .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
995 .descriptorCount = descriptor_count,
996 };
997 VkDescriptorPoolCreateInfo info = {
998 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
999 .maxSets = descriptor_count,
1000 .poolSizeCount = 1,
1001 .pPoolSizes = &pool_size,
1002 .flags = 0,
1003 };
1004 VkResult result =
1005 v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
1006 &info,
1007 &cmd_buffer->device->vk.alloc,
1008 &cmd_buffer->meta.query.dspool);
1009
1010 if (result == VK_SUCCESS) {
1011 assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
1012 const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool;
1013
1014 v3dv_cmd_buffer_add_private_obj(
1015 cmd_buffer, (uintptr_t) vk_pool,
1016 (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
1017
1018 struct v3dv_descriptor_pool *pool =
1019 v3dv_descriptor_pool_from_handle(vk_pool);
1020 pool->is_driver_internal = true;
1021 }
1022
1023 return result;
1024 }
1025
1026 static VkResult
allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer * cmd_buffer,VkDescriptorSet * set)1027 allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
1028 VkDescriptorSet *set)
1029 {
1030 /* Make sure we have a descriptor pool */
1031 VkResult result;
1032 if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) {
1033 result = create_storage_buffer_descriptor_pool(cmd_buffer);
1034 if (result != VK_SUCCESS)
1035 return result;
1036 }
1037 assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
1038
1039 /* Allocate descriptor set */
1040 struct v3dv_device *device = cmd_buffer->device;
1041 VkDevice vk_device = v3dv_device_to_handle(device);
1042 VkDescriptorSetAllocateInfo info = {
1043 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
1044 .descriptorPool = cmd_buffer->meta.query.dspool,
1045 .descriptorSetCount = 1,
1046 .pSetLayouts = &device->queries.buf_descriptor_set_layout,
1047 };
1048 result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
1049
1050 /* If we ran out of pool space, grow the pool and try again */
1051 if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
1052 result = create_storage_buffer_descriptor_pool(cmd_buffer);
1053 if (result == VK_SUCCESS) {
1054 info.descriptorPool = cmd_buffer->meta.query.dspool;
1055 result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
1056 }
1057 }
1058
1059 return result;
1060 }
1061
1062 static uint32_t
copy_pipeline_index_from_flags(VkQueryResultFlags flags)1063 copy_pipeline_index_from_flags(VkQueryResultFlags flags)
1064 {
1065 uint32_t index = 0;
1066 if (flags & VK_QUERY_RESULT_64_BIT)
1067 index |= 1;
1068 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
1069 index |= 2;
1070 if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
1071 index |= 4;
1072 assert(index < 8);
1073 return index;
1074 }
1075
1076 static nir_shader *
1077 get_copy_query_results_cs(VkQueryResultFlags flags);
1078
1079 static void
cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_query_pool * pool,uint32_t first,uint32_t count,struct v3dv_buffer * buf,uint32_t offset,uint32_t stride,VkQueryResultFlags flags)1080 cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer,
1081 struct v3dv_query_pool *pool,
1082 uint32_t first, uint32_t count,
1083 struct v3dv_buffer *buf,
1084 uint32_t offset, uint32_t stride,
1085 VkQueryResultFlags flags)
1086 {
1087 struct v3dv_device *device = cmd_buffer->device;
1088 VkDevice vk_device = v3dv_device_to_handle(device);
1089 VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
1090
1091 /* Create the required copy pipeline if not yet created */
1092 uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags);
1093 if (!device->queries.copy_pipeline[pipeline_idx]) {
1094 nir_shader *copy_query_results_cs_nir = get_copy_query_results_cs(flags);
1095 VkResult result =
1096 v3dv_create_compute_pipeline_from_nir(
1097 device, copy_query_results_cs_nir,
1098 device->queries.copy_pipeline_layout,
1099 &device->queries.copy_pipeline[pipeline_idx]);
1100 ralloc_free(copy_query_results_cs_nir);
1101 if (result != VK_SUCCESS) {
1102 fprintf(stderr, "Failed to create copy query results pipeline\n");
1103 return;
1104 }
1105 }
1106
1107 /* FIXME: do we need this barrier? Since vkCmdEndQuery should've been called
1108 * and that already waits maybe we don't (since this is serialized
1109 * in the compute queue with EndQuery anyway).
1110 */
1111 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1112 VkMemoryBarrier2 barrier = {
1113 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
1114 .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
1115 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
1116 };
1117 VkDependencyInfo barrier_info = {
1118 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
1119 .memoryBarrierCount = 1,
1120 .pMemoryBarriers = &barrier,
1121 };
1122 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
1123 }
1124
1125 /* Allocate and setup descriptor set for output buffer */
1126 VkDescriptorSet out_buf_descriptor_set;
1127 VkResult result =
1128 allocate_storage_buffer_descriptor_set(cmd_buffer,
1129 &out_buf_descriptor_set);
1130 if (result != VK_SUCCESS) {
1131 fprintf(stderr, "vkCmdCopyQueryPoolResults failed: "
1132 "could not allocate descriptor.\n");
1133 return;
1134 }
1135
1136 VkDescriptorBufferInfo desc_buf_info = {
1137 .buffer = v3dv_buffer_to_handle(buf),
1138 .offset = 0,
1139 .range = VK_WHOLE_SIZE,
1140 };
1141 VkWriteDescriptorSet write = {
1142 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1143 .dstSet = out_buf_descriptor_set,
1144 .dstBinding = 0,
1145 .dstArrayElement = 0,
1146 .descriptorCount = 1,
1147 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1148 .pBufferInfo = &desc_buf_info,
1149 };
1150 v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
1151
1152 /* Dispatch copy */
1153 v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
1154
1155 assert(device->queries.copy_pipeline[pipeline_idx]);
1156 v3dv_CmdBindPipeline(vk_cmd_buffer,
1157 VK_PIPELINE_BIND_POINT_COMPUTE,
1158 device->queries.copy_pipeline[pipeline_idx]);
1159
1160 VkDescriptorSet sets[2] = {
1161 pool->meta.descriptor_set,
1162 out_buf_descriptor_set,
1163 };
1164 v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
1165 VK_PIPELINE_BIND_POINT_COMPUTE,
1166 device->queries.copy_pipeline_layout,
1167 0, 2, sets, 0, NULL);
1168
1169 struct {
1170 uint32_t avail_offset, first, offset, stride, flags;
1171 } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags };
1172 v3dv_CmdPushConstants(vk_cmd_buffer,
1173 device->queries.copy_pipeline_layout,
1174 VK_SHADER_STAGE_COMPUTE_BIT,
1175 0, sizeof(push_data), &push_data);
1176
1177 cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
1178
1179 v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
1180 }
1181
1182 static void
cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_query_pool * pool,uint32_t first,uint32_t count,struct v3dv_buffer * dst,uint32_t offset,uint32_t stride,VkQueryResultFlags flags)1183 cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer *cmd_buffer,
1184 struct v3dv_query_pool *pool,
1185 uint32_t first,
1186 uint32_t count,
1187 struct v3dv_buffer *dst,
1188 uint32_t offset,
1189 uint32_t stride,
1190 VkQueryResultFlags flags)
1191 {
1192 struct v3dv_job *job =
1193 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
1194 V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
1195 cmd_buffer, -1);
1196 v3dv_return_if_oom(cmd_buffer, NULL);
1197
1198 job->cpu.query_copy_results.pool = pool;
1199 job->cpu.query_copy_results.first = first;
1200 job->cpu.query_copy_results.count = count;
1201 job->cpu.query_copy_results.dst = dst;
1202 job->cpu.query_copy_results.offset = offset;
1203 job->cpu.query_copy_results.stride = stride;
1204 job->cpu.query_copy_results.flags = flags;
1205
1206 list_addtail(&job->list_link, &cmd_buffer->jobs);
1207 }
1208
1209 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,VkQueryPool queryPool,uint32_t firstQuery,uint32_t queryCount,VkBuffer dstBuffer,VkDeviceSize dstOffset,VkDeviceSize stride,VkQueryResultFlags flags)1210 v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
1211 VkQueryPool queryPool,
1212 uint32_t firstQuery,
1213 uint32_t queryCount,
1214 VkBuffer dstBuffer,
1215 VkDeviceSize dstOffset,
1216 VkDeviceSize stride,
1217 VkQueryResultFlags flags)
1218 {
1219 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1220 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1221 V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);
1222
1223 /* Copies can only happen outside a render pass instance so we should not
1224 * be in the middle of job recording.
1225 */
1226 assert(cmd_buffer->state.pass == NULL);
1227 assert(cmd_buffer->state.job == NULL);
1228
1229 assert(firstQuery < pool->query_count);
1230 assert(firstQuery + queryCount <= pool->query_count);
1231
1232 /* For occlusion queries we implement the copy in the GPU but for other
1233 * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu
1234 * when executed in the queue.
1235 */
1236 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
1237 cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool,
1238 firstQuery, queryCount,
1239 dst, (uint32_t) dstOffset,
1240 (uint32_t) stride, flags);
1241 } else {
1242 cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool,
1243 firstQuery, queryCount,
1244 dst, (uint32_t)dstOffset,
1245 (uint32_t) stride, flags);
1246 }
1247 }
1248
1249 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,VkQueryPool queryPool,uint32_t query,VkQueryControlFlags flags)1250 v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,
1251 VkQueryPool queryPool,
1252 uint32_t query,
1253 VkQueryControlFlags flags)
1254 {
1255 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1256 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1257
1258 v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags);
1259 }
1260
1261 VKAPI_ATTR void VKAPI_CALL
v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,VkQueryPool queryPool,uint32_t query)1262 v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
1263 VkQueryPool queryPool,
1264 uint32_t query)
1265 {
1266 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1267 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1268
1269 v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
1270 }
1271
1272 void
v3dv_reset_query_pool_cpu(struct v3dv_device * device,struct v3dv_query_pool * pool,uint32_t first,uint32_t count)1273 v3dv_reset_query_pool_cpu(struct v3dv_device *device,
1274 struct v3dv_query_pool *pool,
1275 uint32_t first,
1276 uint32_t count)
1277 {
1278 mtx_lock(&device->query_mutex);
1279
1280 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
1281 assert(first + count <= pool->query_count);
1282
1283 /* Reset timestamp */
1284 uint8_t *base_addr;
1285 base_addr = ((uint8_t *) pool->timestamp.bo->map) +
1286 pool->queries[first].timestamp.offset;
1287 memset(base_addr, 0, 8 * count);
1288
1289 for (uint32_t i = first; i < first + count; i++) {
1290 if (vk_sync_reset(&device->vk, pool->queries[i].timestamp.sync) != VK_SUCCESS)
1291 fprintf(stderr, "Failed to reset sync");
1292 }
1293
1294 mtx_unlock(&device->query_mutex);
1295 return;
1296 }
1297
1298 for (uint32_t i = first; i < first + count; i++) {
1299 assert(i < pool->query_count);
1300 struct v3dv_query *q = &pool->queries[i];
1301 q->maybe_available = false;
1302 switch (pool->query_type) {
1303 case VK_QUERY_TYPE_OCCLUSION: {
1304 /* Reset availability */
1305 uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) +
1306 pool->occlusion.avail_offset + first;
1307 memset(base_addr, 0, count);
1308
1309 /* Reset occlusion counter */
1310 const uint8_t *q_addr =
1311 ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
1312 uint32_t *counter = (uint32_t *) q_addr;
1313 *counter = 0;
1314 break;
1315 }
1316 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1317 kperfmon_destroy(device, pool, i);
1318 kperfmon_create(device, pool, i);
1319 if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
1320 fprintf(stderr, "Failed to reset sync");
1321 break;
1322 default:
1323 unreachable("Unsupported query type");
1324 }
1325 }
1326
1327 mtx_unlock(&device->query_mutex);
1328 }
1329
1330 VKAPI_ATTR void VKAPI_CALL
v3dv_ResetQueryPool(VkDevice _device,VkQueryPool queryPool,uint32_t firstQuery,uint32_t queryCount)1331 v3dv_ResetQueryPool(VkDevice _device,
1332 VkQueryPool queryPool,
1333 uint32_t firstQuery,
1334 uint32_t queryCount)
1335 {
1336 V3DV_FROM_HANDLE(v3dv_device, device, _device);
1337 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1338
1339 v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount);
1340 }
1341
1342 VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(VkPhysicalDevice physicalDevice,uint32_t queueFamilyIndex,uint32_t * pCounterCount,VkPerformanceCounterKHR * pCounters,VkPerformanceCounterDescriptionKHR * pCounterDescriptions)1343 v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1344 VkPhysicalDevice physicalDevice,
1345 uint32_t queueFamilyIndex,
1346 uint32_t *pCounterCount,
1347 VkPerformanceCounterKHR *pCounters,
1348 VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
1349 {
1350 V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);
1351
1352 return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount,
1353 pCounters,
1354 pCounterDescriptions);
1355 }
1356
1357 VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(VkPhysicalDevice physicalDevice,const VkQueryPoolPerformanceCreateInfoKHR * pPerformanceQueryCreateInfo,uint32_t * pNumPasses)1358 v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1359 VkPhysicalDevice physicalDevice,
1360 const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
1361 uint32_t *pNumPasses)
1362 {
1363 *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
1364 DRM_V3D_MAX_PERF_COUNTERS);
1365 }
1366
1367 VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AcquireProfilingLockKHR(VkDevice _device,const VkAcquireProfilingLockInfoKHR * pInfo)1368 v3dv_AcquireProfilingLockKHR(
1369 VkDevice _device,
1370 const VkAcquireProfilingLockInfoKHR *pInfo)
1371 {
1372 return VK_SUCCESS;
1373 }
1374
1375 VKAPI_ATTR void VKAPI_CALL
v3dv_ReleaseProfilingLockKHR(VkDevice device)1376 v3dv_ReleaseProfilingLockKHR(VkDevice device)
1377 {
1378 }
1379
1380 static inline void
nir_set_query_availability(nir_builder * b,nir_def * buf,nir_def * offset,nir_def * query_idx,nir_def * avail)1381 nir_set_query_availability(nir_builder *b,
1382 nir_def *buf,
1383 nir_def *offset,
1384 nir_def *query_idx,
1385 nir_def *avail)
1386 {
1387 offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
1388 nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1);
1389 }
1390
1391 static inline nir_def *
nir_get_query_availability(nir_builder * b,nir_def * buf,nir_def * offset,nir_def * query_idx)1392 nir_get_query_availability(nir_builder *b,
1393 nir_def *buf,
1394 nir_def *offset,
1395 nir_def *query_idx)
1396 {
1397 offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
1398 nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1);
1399 return nir_i2i32(b, avail);
1400 }
1401
1402 static nir_shader *
get_set_query_availability_cs()1403 get_set_query_availability_cs()
1404 {
1405 const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1406 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1407 "set query availability cs");
1408
1409 nir_def *buf =
1410 nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1411 .desc_set = 0,
1412 .binding = 0,
1413 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1414
1415 /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1416 * ever change any of these parameters we need to update how we compute the
1417 * query index here.
1418 */
1419 nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1420
1421 nir_def *offset =
1422 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1423
1424 nir_def *query_idx =
1425 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1426
1427 nir_def *avail =
1428 nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1);
1429
1430 query_idx = nir_iadd(&b, query_idx, wg_id);
1431 nir_set_query_availability(&b, buf, offset, query_idx, avail);
1432
1433 return b.shader;
1434 }
1435
1436 static inline nir_def *
nir_get_occlusion_counter_offset(nir_builder * b,nir_def * query_idx)1437 nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx)
1438 {
1439 nir_def *query_group = nir_udiv_imm(b, query_idx, 16);
1440 nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16);
1441 nir_def *offset =
1442 nir_iadd(b, nir_imul_imm(b, query_group, 1024),
1443 nir_imul_imm(b, query_group_offset, 4));
1444 return offset;
1445 }
1446
1447 static inline void
nir_reset_occlusion_counter(nir_builder * b,nir_def * buf,nir_def * query_idx)1448 nir_reset_occlusion_counter(nir_builder *b,
1449 nir_def *buf,
1450 nir_def *query_idx)
1451 {
1452 nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
1453 nir_def *zero = nir_imm_int(b, 0);
1454 nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4);
1455 }
1456
1457 static inline nir_def *
nir_read_occlusion_counter(nir_builder * b,nir_def * buf,nir_def * query_idx)1458 nir_read_occlusion_counter(nir_builder *b,
1459 nir_def *buf,
1460 nir_def *query_idx)
1461 {
1462 nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
1463 return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4);
1464 }
1465
1466 static nir_shader *
get_reset_occlusion_query_cs()1467 get_reset_occlusion_query_cs()
1468 {
1469 const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1470 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1471 "reset occlusion query cs");
1472
1473 nir_def *buf =
1474 nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1475 .desc_set = 0,
1476 .binding = 0,
1477 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1478
1479 /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1480 * ever change any of these parameters we need to update how we compute the
1481 * query index here.
1482 */
1483 nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1484
1485 nir_def *avail_offset =
1486 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1487
1488 nir_def *base_query_idx =
1489 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1490
1491 nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
1492
1493 nir_set_query_availability(&b, buf, avail_offset, query_idx,
1494 nir_imm_intN_t(&b, 0, 8));
1495 nir_reset_occlusion_counter(&b, buf, query_idx);
1496
1497 return b.shader;
1498 }
1499
1500 static void
write_query_buffer(nir_builder * b,nir_def * buf,nir_def ** offset,nir_def * value,bool flag_64bit)1501 write_query_buffer(nir_builder *b,
1502 nir_def *buf,
1503 nir_def **offset,
1504 nir_def *value,
1505 bool flag_64bit)
1506 {
1507 if (flag_64bit) {
1508 /* Create a 64-bit value using a vec2 with the .Y component set to 0
1509 * so we can write a 64-bit value in a single store.
1510 */
1511 nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0));
1512 nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8);
1513 *offset = nir_iadd_imm(b, *offset, 8);
1514 } else {
1515 nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4);
1516 *offset = nir_iadd_imm(b, *offset, 4);
1517 }
1518 }
1519
1520 static nir_shader *
get_copy_query_results_cs(VkQueryResultFlags flags)1521 get_copy_query_results_cs(VkQueryResultFlags flags)
1522 {
1523 bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT;
1524 bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
1525 bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
1526
1527 const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1528 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1529 "copy query results cs");
1530
1531 nir_def *buf =
1532 nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1533 .desc_set = 0,
1534 .binding = 0,
1535 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1536
1537 nir_def *buf_out =
1538 nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1539 .desc_set = 1,
1540 .binding = 0,
1541 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1542
1543 /* Read push constants */
1544 nir_def *avail_offset =
1545 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1546
1547 nir_def *base_query_idx =
1548 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1549
1550 nir_def *base_offset_out =
1551 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4);
1552
1553 nir_def *stride =
1554 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4);
1555
1556 /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1557 * ever change any of these parameters we need to update how we compute the
1558 * query index here.
1559 */
1560 nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1561 nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
1562
1563 /* Read query availability if needed */
1564 nir_def *avail = NULL;
1565 if (flag_avail || !flag_partial)
1566 avail = nir_get_query_availability(&b, buf, avail_offset, query_idx);
1567
   /* Write occlusion query result... */
   nir_def *offset =
      nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride));

   /* ...if partial is requested, we always write */
   if (flag_partial) {
      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
   } else {
      /* ...otherwise, we only write if the query is available */
      nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0));
      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
      nir_pop_if(&b, if_stmt);
   }

   /* Write query availability */
   if (flag_avail)
      write_query_buffer(&b, buf_out, &offset, avail, flag_64bit);

   return b.shader;
}

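/**
 * Creates the shared descriptor set layout, the pipeline layouts and the
 * compute pipelines used to implement query operations on the query BO:
 * setting availability, resetting occlusion queries and copying results.
 * Copy pipelines themselves are created lazily on first use; only their
 * layout is created here.
 */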
static bool
create_query_pipelines(struct v3dv_device *device)
{
   VkResult result;
   VkPipeline pipeline;

   /* Set layout: single storage buffer */
   if (!device->queries.buf_descriptor_set_layout) {
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      };
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &descriptor_set_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &descriptor_set_layout_info,
                                        &device->vk.alloc,
                                        &device->queries.buf_descriptor_set_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   /* Set availability pipeline.
    *
    * Pipeline layout:
    * - 1 storage buffer for the BO with the query availability.
    * - Push constants:
    *   0B: offset of the availability info in the buffer (4B)
    *   4B: base query index (4B)
    *   8B: availability (1B)
    */
   if (!device->queries.avail_pipeline_layout) {
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.avail_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   if (!device->queries.avail_pipeline) {
      nir_shader *set_query_availability_cs_nir = get_set_query_availability_cs();
      result = v3dv_create_compute_pipeline_from_nir(device,
                                                     set_query_availability_cs_nir,
                                                     device->queries.avail_pipeline_layout,
                                                     &pipeline);
      ralloc_free(set_query_availability_cs_nir);
      if (result != VK_SUCCESS)
         return false;

      device->queries.avail_pipeline = pipeline;
   }

   /* Reset occlusion query pipeline.
    *
    * Pipeline layout:
    * - 1 storage buffer for the BO with the occlusion and availability data.
    * - Push constants:
    *   0B: offset of the availability info in the buffer (4B)
    *   4B: base query index (4B)
    */
   if (!device->queries.reset_occlusion_pipeline_layout) {
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.reset_occlusion_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   if (!device->queries.reset_occlusion_pipeline) {
      nir_shader *reset_occlusion_query_cs_nir = get_reset_occlusion_query_cs();
      result = v3dv_create_compute_pipeline_from_nir(
         device,
         reset_occlusion_query_cs_nir,
         device->queries.reset_occlusion_pipeline_layout,
         &pipeline);
      ralloc_free(reset_occlusion_query_cs_nir);
      if (result != VK_SUCCESS)
         return false;

      device->queries.reset_occlusion_pipeline = pipeline;
   }

   /* Copy query results pipelines.
    *
    * Pipeline layout:
    * - 1 storage buffer for the BO with the query availability and occlusion.
    * - 1 storage buffer for the output.
    * - Push constants:
    *   0B: offset of the availability info in the buffer (4B)
    *   4B: base query index (4B)
    *   8B: offset into output buffer (4B)
    *   12B: stride (4B)
    *
    * We create multiple specialized pipelines depending on the copy flags
    * to remove conditionals from the copy shader and get more optimized
    * pipelines.
    */
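   /* For reference, a minimal sketch of how a caller would be expected to
    * feed these push constants and dispatch one workgroup per query
    * (hypothetical names; the real dispatch is done from the command buffer
    * code):
    *
    *    uint32_t push_data[4] = { avail_offset, first_query,
    *                              dst_buffer_offset, dst_stride };
    *    vkCmdPushConstants(cmd_buf, device->queries.copy_pipeline_layout,
    *                       VK_SHADER_STAGE_COMPUTE_BIT,
    *                       0, sizeof(push_data), push_data);
    *    vkCmdDispatch(cmd_buf, query_count, 1, 1);
    */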
   if (!device->queries.copy_pipeline_layout) {
      VkDescriptorSetLayout set_layouts[2] = {
         device->queries.buf_descriptor_set_layout,
         device->queries.buf_descriptor_set_layout
      };
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 2,
         .pSetLayouts = set_layouts,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.copy_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   /* The actual copy pipelines are created lazily on demand: there can be up
    * to 8 of them depending on the flags used, but applications are likely to
    * use the same flags every time, in which case only one is required.
    */

   return true;
}

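/**
 * Destroys the pipelines, pipeline layouts and descriptor set layout created
 * by create_query_pipelines. Vulkan allows destroying VK_NULL_HANDLE objects,
 * so entries that were never created are handled naturally.
 */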
static void
destroy_query_pipelines(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   /* Availability pipeline */
   v3dv_DestroyPipeline(_device, device->queries.avail_pipeline,
                        &device->vk.alloc);
   device->queries.avail_pipeline = VK_NULL_HANDLE;
   v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout,
                              &device->vk.alloc);
   device->queries.avail_pipeline_layout = VK_NULL_HANDLE;

   /* Reset occlusion pipeline */
   v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline,
                        &device->vk.alloc);
   device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE;
   v3dv_DestroyPipelineLayout(_device,
                              device->queries.reset_occlusion_pipeline_layout,
                              &device->vk.alloc);
   device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE;

   /* Copy pipelines */
   for (int i = 0; i < 8; i++) {
      v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i],
                           &device->vk.alloc);
      device->queries.copy_pipeline[i] = VK_NULL_HANDLE;
   }
   v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout,
                              &device->vk.alloc);
   device->queries.copy_pipeline_layout = VK_NULL_HANDLE;

   v3dv_DestroyDescriptorSetLayout(_device,
                                   device->queries.buf_descriptor_set_layout,
                                   &device->vk.alloc);
   device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE;
}

/**
 * Allocates device resources for implementing certain types of queries.
 */
VkResult
v3dv_query_allocate_resources(struct v3dv_device *device)
{
   if (!create_query_pipelines(device))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   return VK_SUCCESS;
}

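/**
 * Releases the device resources allocated by v3dv_query_allocate_resources.
 */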
void
v3dv_query_free_resources(struct v3dv_device *device)
{
   destroy_query_pipelines(device);
}