1 /*
2 * Copyright © 2020 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "v3dv_private.h"
25
26 #include "util/timespec.h"
27 #include "compiler/nir/nir_builder.h"
28
29 static void
30 kperfmon_create(struct v3dv_device *device,
31 struct v3dv_query_pool *pool,
32 uint32_t query)
33 {
34 for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
35 assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
36
37 struct drm_v3d_perfmon_create req = {
38 .ncounters = MIN2(pool->perfmon.ncounters -
39 i * DRM_V3D_MAX_PERF_COUNTERS,
40 DRM_V3D_MAX_PERF_COUNTERS),
41 };
42 memcpy(req.counters,
43 &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
44 req.ncounters);
45
46 int ret = v3d_ioctl(device->pdevice->render_fd,
47 DRM_IOCTL_V3D_PERFMON_CREATE,
48 &req);
49 if (ret)
50 mesa_loge("Failed to create perfmon for query %d: %s\n", query,
51 strerror(errno));
52
53 pool->queries[query].perf.kperfmon_ids[i] = req.id;
54 }
55 }
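/* For example: with DRM_V3D_MAX_PERF_COUNTERS at 32, a pool that enables 45
 * counters has nperfmons = 2, so each query gets two kernel perfmons, the
 * first tracking 32 counters and the second the remaining 13.
 */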
56
57 static void
58 kperfmon_destroy(struct v3dv_device *device,
59 struct v3dv_query_pool *pool,
60 uint32_t query)
61 {
62 /* Skip destroying if never created */
63 if (!pool->queries[query].perf.kperfmon_ids[0])
64 return;
65
66 for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
67 struct drm_v3d_perfmon_destroy req = {
68 .id = pool->queries[query].perf.kperfmon_ids[i]
69 };
70
71 int ret = v3d_ioctl(device->pdevice->render_fd,
72 DRM_IOCTL_V3D_PERFMON_DESTROY,
73 &req);
74
75 if (ret) {
76 mesa_loge("Failed to destroy perfmon %u: %s\n",
77 req.id, strerror(errno));
78 }
79 }
80 }
81
82 /**
83 * Creates a VkBuffer (and VkDeviceMemory) to access a BO.
84 */
85 static VkResult
86 create_vk_storage_buffer(struct v3dv_device *device,
87 struct v3dv_bo *bo,
88 VkBuffer *vk_buf,
89 VkDeviceMemory *vk_mem)
90 {
91 VkDevice vk_device = v3dv_device_to_handle(device);
92
93 VkBufferCreateInfo buf_info = {
94 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
95 .size = bo->size,
96 .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
97 };
98 VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf);
99 if (result != VK_SUCCESS)
100 return result;
101
102 struct v3dv_device_memory *mem =
103 vk_object_zalloc(&device->vk, NULL, sizeof(*mem),
104 VK_OBJECT_TYPE_DEVICE_MEMORY);
105 if (!mem)
106 return VK_ERROR_OUT_OF_HOST_MEMORY;
107
108 mem->bo = bo;
109 mem->type = &device->pdevice->memory.memoryTypes[0];
110
111 *vk_mem = v3dv_device_memory_to_handle(mem);
112 VkBindBufferMemoryInfo bind_info = {
113 .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
114 .buffer = *vk_buf,
115 .memory = *vk_mem,
116 .memoryOffset = 0,
117 };
118 v3dv_BindBufferMemory2(vk_device, 1, &bind_info);
119
120 return VK_SUCCESS;
121 }
122
123 static void
124 destroy_vk_storage_buffer(struct v3dv_device *device,
125 VkBuffer *vk_buf,
126 VkDeviceMemory *vk_mem)
127 {
128 if (*vk_mem) {
129 vk_object_free(&device->vk, NULL, v3dv_device_memory_from_handle(*vk_mem));
130 *vk_mem = VK_NULL_HANDLE;
131 }
132
133 v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL);
134 *vk_buf = VK_NULL_HANDLE;
135 }
136
137 /**
138 * Allocates descriptor sets to access query pool BO (availability and
139 * occlusion query results) from Vulkan pipelines.
140 */
141 static VkResult
142 create_pool_descriptors(struct v3dv_device *device,
143 struct v3dv_query_pool *pool)
144 {
145 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
146 VkDevice vk_device = v3dv_device_to_handle(device);
147
148 VkDescriptorPoolSize pool_size = {
149 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
150 .descriptorCount = 1,
151 };
152 VkDescriptorPoolCreateInfo pool_info = {
153 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
154 .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
155 .maxSets = 1,
156 .poolSizeCount = 1,
157 .pPoolSizes = &pool_size,
158 };
159 VkResult result =
160 v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL,
161 &pool->meta.descriptor_pool);
162
163 if (result != VK_SUCCESS)
164 return result;
165
166 VkDescriptorSetAllocateInfo alloc_info = {
167 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
168 .descriptorPool = pool->meta.descriptor_pool,
169 .descriptorSetCount = 1,
170 .pSetLayouts = &device->queries.buf_descriptor_set_layout,
171 };
172 result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info,
173 &pool->meta.descriptor_set);
174 if (result != VK_SUCCESS)
175 return result;
176
177 VkDescriptorBufferInfo desc_buf_info = {
178 .buffer = pool->meta.buf,
179 .offset = 0,
180 .range = VK_WHOLE_SIZE,
181 };
182
183 VkWriteDescriptorSet write = {
184 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
185 .dstSet = pool->meta.descriptor_set,
186 .dstBinding = 0,
187 .dstArrayElement = 0,
188 .descriptorCount = 1,
189 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
190 .pBufferInfo = &desc_buf_info,
191 };
192 v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
193
194 return VK_SUCCESS;
195 }
196
197 static void
198 destroy_pool_descriptors(struct v3dv_device *device,
199 struct v3dv_query_pool *pool)
200 {
201 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
202
203 v3dv_FreeDescriptorSets(v3dv_device_to_handle(device),
204 pool->meta.descriptor_pool,
205 1, &pool->meta.descriptor_set);
206 pool->meta.descriptor_set = VK_NULL_HANDLE;
207
208 v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device),
209 pool->meta.descriptor_pool, NULL);
210 pool->meta.descriptor_pool = VK_NULL_HANDLE;
211 }
212
213 static VkResult
214 pool_create_meta_resources(struct v3dv_device *device,
215 struct v3dv_query_pool *pool)
216 {
217 VkResult result;
218
219 if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
220 return VK_SUCCESS;
221
222 result = create_vk_storage_buffer(device, pool->occlusion.bo,
223 &pool->meta.buf, &pool->meta.mem);
224 if (result != VK_SUCCESS)
225 return result;
226
227 result = create_pool_descriptors(device, pool);
228 if (result != VK_SUCCESS)
229 return result;
230
231 return VK_SUCCESS;
232 }
233
234 static void
235 pool_destroy_meta_resources(struct v3dv_device *device,
236 struct v3dv_query_pool *pool)
237 {
238 if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
239 return;
240
241 destroy_pool_descriptors(device, pool);
242 destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem);
243 }
244
245 VKAPI_ATTR VkResult VKAPI_CALL
246 v3dv_CreateQueryPool(VkDevice _device,
247 const VkQueryPoolCreateInfo *pCreateInfo,
248 const VkAllocationCallbacks *pAllocator,
249 VkQueryPool *pQueryPool)
250 {
251 V3DV_FROM_HANDLE(v3dv_device, device, _device);
252
253 assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
254 pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
255 pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
256 assert(pCreateInfo->queryCount > 0);
257
258 struct v3dv_query_pool *pool =
259 vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
260 VK_OBJECT_TYPE_QUERY_POOL);
261 if (pool == NULL)
262 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
263
264 pool->query_type = pCreateInfo->queryType;
265 pool->query_count = pCreateInfo->queryCount;
266
267 uint32_t query_idx = 0;
268 VkResult result;
269
270 const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
271 pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
272 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
273 if (pool->queries == NULL) {
274 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
275 goto fail;
276 }
277
278 switch (pool->query_type) {
279 case VK_QUERY_TYPE_OCCLUSION: {
280 /* The hardware allows us to set up groups of 16 queries in consecutive
281 * 4-byte slots, requiring only that each group of 16 queries is
282 * aligned to a 1024-byte boundary.
283 */
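/* For example, a pool with 100 queries needs DIV_ROUND_UP(100, 16) = 7
 * groups, i.e. 7 * 1024 = 7168 bytes of counter storage, followed by 100
 * bytes of availability data (one byte per query).
 */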
284 const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
285 uint32_t bo_size = query_groups * 1024;
286 /* After the counters we store availability data, 1 byte per query */
287 pool->occlusion.avail_offset = bo_size;
288 bo_size += pool->query_count;
289 pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true);
290 if (!pool->occlusion.bo) {
291 result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
292 goto fail;
293 }
294 if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) {
295 result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
296 goto fail;
297 }
298 break;
299 }
300 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
301 const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
302 vk_find_struct_const(pCreateInfo->pNext,
303 QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
304
305 assert(pq_info);
306
307 pool->perfmon.ncounters = pq_info->counterIndexCount;
308 for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
309 pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
310
311 pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
312 DRM_V3D_MAX_PERF_COUNTERS);
313
314 assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
315 break;
316 }
317 case VK_QUERY_TYPE_TIMESTAMP: {
318 /* We use 8 bytes per query for the timestamp value, with all
319 * timestamps tightly packed at the start of the buffer.
320 */
321 const uint32_t bo_size = pool->query_count * 8;
322 pool->timestamp.bo = v3dv_bo_alloc(device, bo_size, "query:t", true);
323 if (!pool->timestamp.bo) {
324 result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
325 goto fail;
326 }
327 if (!v3dv_bo_map(device, pool->timestamp.bo, bo_size)) {
328 result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
329 goto fail;
330 }
331 break;
332 }
333 default:
334 unreachable("Unsupported query type");
335 }
336
337 /* Initialize queries in the pool */
338 for (; query_idx < pool->query_count; query_idx++) {
339 pool->queries[query_idx].maybe_available = false;
340 switch (pool->query_type) {
341 case VK_QUERY_TYPE_OCCLUSION: {
342 const uint32_t query_group = query_idx / 16;
343 const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
344 pool->queries[query_idx].occlusion.offset = query_offset;
345 break;
346 }
347 case VK_QUERY_TYPE_TIMESTAMP:
348 pool->queries[query_idx].timestamp.offset = query_idx * 8;
349 result = vk_sync_create(&device->vk,
350 &device->pdevice->drm_syncobj_type, 0, 0,
351 &pool->queries[query_idx].timestamp.sync);
352 if (result != VK_SUCCESS)
353 goto fail;
354 break;
355 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
356 result = vk_sync_create(&device->vk,
357 &device->pdevice->drm_syncobj_type, 0, 0,
358 &pool->queries[query_idx].perf.last_job_sync);
359 if (result != VK_SUCCESS)
360 goto fail;
361
362 kperfmon_create(device, pool, query_idx);
363 break;
364 }
365 default:
366 unreachable("Unsupported query type");
367 }
368 }
369
370 /* Create meta resources */
371 result = pool_create_meta_resources(device, pool);
372 if (result != VK_SUCCESS)
373 goto fail;
374
375 *pQueryPool = v3dv_query_pool_to_handle(pool);
376
377 return VK_SUCCESS;
378
379 fail:
380 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
381 for (uint32_t j = 0; j < query_idx; j++)
382 vk_sync_destroy(&device->vk, pool->queries[j].timestamp.sync);
383 }
384
385 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
386 for (uint32_t j = 0; j < query_idx; j++)
387 vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
388 }
389
390 if (pool->occlusion.bo)
391 v3dv_bo_free(device, pool->occlusion.bo);
392 if (pool->timestamp.bo)
393 v3dv_bo_free(device, pool->timestamp.bo);
394 if (pool->queries)
395 vk_free2(&device->vk.alloc, pAllocator, pool->queries);
396 pool_destroy_meta_resources(device, pool);
397 vk_object_free(&device->vk, pAllocator, pool);
398
399 return result;
400 }
401
402 VKAPI_ATTR void VKAPI_CALL
403 v3dv_DestroyQueryPool(VkDevice _device,
404 VkQueryPool queryPool,
405 const VkAllocationCallbacks *pAllocator)
406 {
407 V3DV_FROM_HANDLE(v3dv_device, device, _device);
408 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
409
410 if (!pool)
411 return;
412
413 if (pool->occlusion.bo)
414 v3dv_bo_free(device, pool->occlusion.bo);
415
416 if (pool->timestamp.bo)
417 v3dv_bo_free(device, pool->timestamp.bo);
418
419 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
420 for (uint32_t i = 0; i < pool->query_count; i++)
421 vk_sync_destroy(&device->vk, pool->queries[i].timestamp.sync);
422 }
423
424 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
425 for (uint32_t i = 0; i < pool->query_count; i++) {
426 kperfmon_destroy(device, pool, i);
427 vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
428 }
429 }
430
431 if (pool->queries)
432 vk_free2(&device->vk.alloc, pAllocator, pool->queries);
433
434 pool_destroy_meta_resources(device, pool);
435
436 vk_object_free(&device->vk, pAllocator, pool);
437 }
438
439 static void
440 write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
441 {
442 if (do_64bit) {
443 uint64_t *dst64 = (uint64_t *) dst;
444 dst64[idx] = value;
445 } else {
446 uint32_t *dst32 = (uint32_t *) dst;
447 dst32[idx] = (uint32_t) value;
448 }
449 }
450
451 static VkResult
452 query_wait_available(struct v3dv_device *device,
453 struct v3dv_query_pool *pool,
454 struct v3dv_query *q,
455 uint32_t query_idx)
456 {
457 /* For occlusion queries we prefer to poll the availability BO in a loop
458 * rather than wait on the query results BO, because the latter would
459 * make us wait for any job running queries from the pool, even if those
460 * jobs do not involve the query we want to wait on.
461 */
462 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
463 uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
464 pool->occlusion.avail_offset + query_idx;
465 while (*q_addr == 0)
466 usleep(250);
467 return VK_SUCCESS;
468 }
469
470 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
471 if (vk_sync_wait(&device->vk, q->timestamp.sync,
472 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
473 return vk_device_set_lost(&device->vk, "Query job wait failed");
474 }
475 return VK_SUCCESS;
476 }
477
478 assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
479
480 /* For performance queries we need to wait for the queue to signal that
481 * the query has been submitted for execution before anything else.
482 */
483 VkResult result = VK_SUCCESS;
484 if (!q->maybe_available) {
485 struct timespec timeout;
486 timespec_get(&timeout, TIME_UTC);
487 timespec_add_msec(&timeout, &timeout, 2000);
488
489 mtx_lock(&device->query_mutex);
490 while (!q->maybe_available) {
491 if (vk_device_is_lost(&device->vk)) {
492 result = VK_ERROR_DEVICE_LOST;
493 break;
494 }
495
496 int ret = cnd_timedwait(&device->query_ended,
497 &device->query_mutex,
498 &timeout);
499 if (ret != thrd_success) {
500 mtx_unlock(&device->query_mutex);
501 result = vk_device_set_lost(&device->vk, "Query wait failed");
502 break;
503 }
504 }
505 mtx_unlock(&device->query_mutex);
506
507 if (result != VK_SUCCESS)
508 return result;
509
510 /* For performance queries, we also need to wait for the relevant syncobj
511 * to be signaled to ensure completion of the GPU work.
512 */
513 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
514 vk_sync_wait(&device->vk, q->perf.last_job_sync,
515 0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
516 return vk_device_set_lost(&device->vk, "Query job wait failed");
517 }
518 }
519
520 return result;
521 }
522
523 static VkResult
524 query_check_available(struct v3dv_device *device,
525 struct v3dv_query_pool *pool,
526 struct v3dv_query *q,
527 uint32_t query_idx)
528 {
529 /* For occlusion queries we check the availability BO */
530 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
531 const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
532 pool->occlusion.avail_offset + query_idx;
533 return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY;
534 }
535
536 /* For timestamp queries, we need to check if the relevant job
537 * has completed.
538 */
539 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
540 if (vk_sync_wait(&device->vk, q->timestamp.sync,
541 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
542 return VK_NOT_READY;
543 }
544 return VK_SUCCESS;
545 }
546
547 /* For other queries we need to check if the queue has submitted the query
548 * for execution at all.
549 */
550 assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
551 if (!q->maybe_available)
552 return VK_NOT_READY;
553
554 /* For performance queries, we also need to check if the relevant GPU job
555 * has completed.
556 */
557 if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
558 vk_sync_wait(&device->vk, q->perf.last_job_sync,
559 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
560 return VK_NOT_READY;
561 }
562
563 return VK_SUCCESS;
564 }
565
566 static VkResult
567 query_is_available(struct v3dv_device *device,
568 struct v3dv_query_pool *pool,
569 uint32_t query,
570 bool do_wait,
571 bool *available)
572 {
573 struct v3dv_query *q = &pool->queries[query];
574
575 if (do_wait) {
576 VkResult result = query_wait_available(device, pool, q, query);
577 if (result != VK_SUCCESS) {
578 *available = false;
579 return result;
580 }
581
582 *available = true;
583 } else {
584 VkResult result = query_check_available(device, pool, q, query);
585 assert(result == VK_SUCCESS || result == VK_NOT_READY);
586 *available = (result == VK_SUCCESS);
587 }
588
589 return VK_SUCCESS;
590 }
591
592 static VkResult
593 write_occlusion_query_result(struct v3dv_device *device,
594 struct v3dv_query_pool *pool,
595 uint32_t query,
596 bool do_64bit,
597 void *data,
598 uint32_t slot)
599 {
600 assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
601
602 if (vk_device_is_lost(&device->vk))
603 return VK_ERROR_DEVICE_LOST;
604
605 struct v3dv_query *q = &pool->queries[query];
606 assert(pool->occlusion.bo && pool->occlusion.bo->map);
607
608 const uint8_t *query_addr =
609 ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
610 write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
611 return VK_SUCCESS;
612 }
613
614 static VkResult
615 write_timestamp_query_result(struct v3dv_device *device,
616 struct v3dv_query_pool *pool,
617 uint32_t query,
618 bool do_64bit,
619 void *data,
620 uint32_t slot)
621 {
622 assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
623
624 struct v3dv_query *q = &pool->queries[query];
625
626 const uint8_t *query_addr =
627 ((uint8_t *) pool->timestamp.bo->map) + q->timestamp.offset;
628
629 write_to_buffer(data, slot, do_64bit, *((uint64_t *)query_addr));
630 return VK_SUCCESS;
631 }
632
633 static VkResult
634 write_performance_query_result(struct v3dv_device *device,
635 struct v3dv_query_pool *pool,
636 uint32_t query,
637 bool do_64bit,
638 void *data,
639 uint32_t slot)
640 {
641 assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
642
643 struct v3dv_query *q = &pool->queries[query];
644 uint64_t counter_values[V3D_MAX_PERFCNT];
645
646 assert(pool->perfmon.nperfmons);
647 assert(pool->perfmon.ncounters);
648
649 for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
650 struct drm_v3d_perfmon_get_values req = {
651 .id = q->perf.kperfmon_ids[i],
652 .values_ptr = (uintptr_t)(&counter_values[i *
653 DRM_V3D_MAX_PERF_COUNTERS])
654 };
655
656 int ret = v3d_ioctl(device->pdevice->render_fd,
657 DRM_IOCTL_V3D_PERFMON_GET_VALUES,
658 &req);
659
660 if (ret) {
661 mesa_loge("failed to get perfmon values: %s\n", strerror(errno));
662 return vk_error(device, VK_ERROR_DEVICE_LOST);
663 }
664 }
665
666 for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
667 write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
668
669 return VK_SUCCESS;
670 }
671
672 static VkResult
673 write_query_result(struct v3dv_device *device,
674 struct v3dv_query_pool *pool,
675 uint32_t query,
676 bool do_64bit,
677 void *data,
678 uint32_t slot)
679 {
680 switch (pool->query_type) {
681 case VK_QUERY_TYPE_OCCLUSION:
682 return write_occlusion_query_result(device, pool, query, do_64bit,
683 data, slot);
684 case VK_QUERY_TYPE_TIMESTAMP:
685 return write_timestamp_query_result(device, pool, query, do_64bit,
686 data, slot);
687 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
688 return write_performance_query_result(device, pool, query, do_64bit,
689 data, slot);
690 default:
691 unreachable("Unsupported query type");
692 }
693 }
694
695 static uint32_t
696 get_query_result_count(struct v3dv_query_pool *pool)
697 {
698 switch (pool->query_type) {
699 case VK_QUERY_TYPE_OCCLUSION:
700 case VK_QUERY_TYPE_TIMESTAMP:
701 return 1;
702 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
703 return pool->perfmon.ncounters;
704 default:
705 unreachable("Unsupported query type");
706 }
707 }
708
709 VkResult
710 v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
711 struct v3dv_query_pool *pool,
712 uint32_t first,
713 uint32_t count,
714 void *data,
715 VkDeviceSize stride,
716 VkQueryResultFlags flags)
717 {
718 assert(first < pool->query_count);
719 assert(first + count <= pool->query_count);
720 assert(data);
721
722 const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
723 pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
724 const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
725 const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
726
727 uint32_t result_count = get_query_result_count(pool);
728
729 VkResult result = VK_SUCCESS;
730 for (uint32_t i = first; i < first + count; i++) {
731 bool available = false;
732 VkResult query_result =
733 query_is_available(device, pool, i, do_wait, &available);
734 if (query_result == VK_ERROR_DEVICE_LOST)
735 result = VK_ERROR_DEVICE_LOST;
736
737 /**
738 * From the Vulkan 1.0 spec:
739 *
740 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
741 * both not set then no result values are written to pData for queries
742 * that are in the unavailable state at the time of the call, and
743 * vkGetQueryPoolResults returns VK_NOT_READY. However, availability
744 * state is still written to pData for those queries if
745 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
746 */
747 uint32_t slot = 0;
748
749 const bool write_result = available || do_partial;
750 if (write_result)
751 write_query_result(device, pool, i, do_64bit, data, slot);
752 slot += result_count;
753
754 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
755 write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
756
757 if (!write_result && result != VK_ERROR_DEVICE_LOST)
758 result = VK_NOT_READY;
759
760 data += stride;
761 }
762
763 return result;
764 }
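/* Example of the layout produced above for an occlusion pool queried with
 * VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT: each query
 * i writes a 64-bit counter followed by a 64-bit availability word at
 * data + (i - first) * stride, so the caller's stride must be at least 16
 * bytes in that case.
 */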
765
766 VKAPI_ATTR VkResult VKAPI_CALL
767 v3dv_GetQueryPoolResults(VkDevice _device,
768 VkQueryPool queryPool,
769 uint32_t firstQuery,
770 uint32_t queryCount,
771 size_t dataSize,
772 void *pData,
773 VkDeviceSize stride,
774 VkQueryResultFlags flags)
775 {
776 V3DV_FROM_HANDLE(v3dv_device, device, _device);
777 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
778
779 if (vk_device_is_lost(&device->vk))
780 return VK_ERROR_DEVICE_LOST;
781
782 return v3dv_get_query_pool_results_cpu(device, pool, firstQuery, queryCount,
783 pData, stride, flags);
784 }
785
786 /* Emits a series of vkCmdDispatchBase calls to execute all the workgroups
787 * required to handle a number of queries, honoring the per-dispatch limit.
788 */
789 static void
790 cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer,
791 uint32_t query_count)
792 {
793 VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
794
795 uint32_t dispatched = 0;
796 const uint32_t max_batch_size = 65535;
797 while (dispatched < query_count) {
798 uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size);
799 v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1);
800 dispatched += batch_size;
801 }
802 }
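/* For instance, a count of 70000 queries is split into two dispatches:
 * vkCmdDispatchBase(cb, 0, 0, 0, 65535, 1, 1) followed by
 * vkCmdDispatchBase(cb, 65535, 0, 0, 4465, 1, 1), with the base workgroup
 * index carrying the query offset into the shader.
 */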
803
804 void
805 v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer,
806 struct v3dv_query_pool *pool,
807 uint32_t query, uint32_t count,
808 uint8_t availability)
809 {
810 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
811 pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
812
813 struct v3dv_device *device = cmd_buffer->device;
814 VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
815
816 /* We are about to emit a compute job to set query availability and we need
817 * to ensure this executes after the graphics work using the queries has
818 * completed.
819 */
820 VkMemoryBarrier2 barrier = {
821 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
822 .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
823 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
824 };
825 VkDependencyInfo barrier_info = {
826 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
827 .memoryBarrierCount = 1,
828 .pMemoryBarriers = &barrier,
829 };
830 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
831
832 /* Dispatch queries */
833 v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
834
835 v3dv_CmdBindPipeline(vk_cmd_buffer,
836 VK_PIPELINE_BIND_POINT_COMPUTE,
837 device->queries.avail_pipeline);
838
839 v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
840 VK_PIPELINE_BIND_POINT_COMPUTE,
841 device->queries.avail_pipeline_layout,
842 0, 1, &pool->meta.descriptor_set,
843 0, NULL);
844
845 struct {
846 uint32_t offset;
847 uint32_t query;
848 uint8_t availability;
849 } push_data = { pool->occlusion.avail_offset, query, availability };
850 v3dv_CmdPushConstants(vk_cmd_buffer,
851 device->queries.avail_pipeline_layout,
852 VK_SHADER_STAGE_COMPUTE_BIT,
853 0, sizeof(push_data), &push_data);
854 cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
855
856 v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
857 }
858
859 static void
860 cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
861 struct v3dv_query_pool *pool,
862 uint32_t query, uint32_t count)
863 {
864 struct v3dv_device *device = cmd_buffer->device;
865 VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
866
867 /* Ensure the GPU is done with the queries in the graphics queue before
868 * we reset in the compute queue.
869 */
870 VkMemoryBarrier2 barrier = {
871 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
872 .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
873 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
874 };
875 VkDependencyInfo barrier_info = {
876 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
877 .memoryBarrierCount = 1,
878 .pMemoryBarriers = &barrier,
879 };
880 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
881
882 /* Emit compute reset */
883 v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
884
885 v3dv_CmdBindPipeline(vk_cmd_buffer,
886 VK_PIPELINE_BIND_POINT_COMPUTE,
887 device->queries.reset_occlusion_pipeline);
888
889 v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
890 VK_PIPELINE_BIND_POINT_COMPUTE,
891 device->queries.reset_occlusion_pipeline_layout,
892 0, 1, &pool->meta.descriptor_set,
893 0, NULL);
894 struct {
895 uint32_t offset;
896 uint32_t query;
897 } push_data = { pool->occlusion.avail_offset, query };
898 v3dv_CmdPushConstants(vk_cmd_buffer,
899 device->queries.reset_occlusion_pipeline_layout,
900 VK_SHADER_STAGE_COMPUTE_BIT,
901 0, sizeof(push_data), &push_data);
902
903 cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
904
905 v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
906
907 /* Ensure future work in the graphics queue using the queries doesn't start
908 * before the reset has completed.
909 */
910 barrier = (VkMemoryBarrier2) {
911 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
912 .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
913 .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT,
914 };
915 barrier_info = (VkDependencyInfo) {
916 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
917 .memoryBarrierCount = 1,
918 .pMemoryBarriers = &barrier,
919 };
920 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
921 }
922
923 static void
924 cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
925 struct v3dv_query_pool *pool,
926 uint32_t first, uint32_t count)
927 {
928 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
929 cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count);
930 }
931
932 static void
933 cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer,
934 struct v3dv_query_pool *pool,
935 uint32_t first, uint32_t count)
936 {
937 assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION);
938
939 struct v3dv_job *job =
940 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
941 V3DV_JOB_TYPE_CPU_RESET_QUERIES,
942 cmd_buffer, -1);
943 v3dv_return_if_oom(cmd_buffer, NULL);
944 job->cpu.query_reset.pool = pool;
945 job->cpu.query_reset.first = first;
946 job->cpu.query_reset.count = count;
947 list_addtail(&job->list_link, &cmd_buffer->jobs);
948 }
949
950 VKAPI_ATTR void VKAPI_CALL
951 v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
952 VkQueryPool queryPool,
953 uint32_t firstQuery,
954 uint32_t queryCount)
955 {
956 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
957 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
958
959 /* Resets can only happen outside a render pass instance so we should not
960 * be in the middle of job recording.
961 */
962 assert(cmd_buffer->state.pass == NULL);
963 assert(cmd_buffer->state.job == NULL);
964
965 assert(firstQuery < pool->query_count);
966 assert(firstQuery + queryCount <= pool->query_count);
967
968 /* We can reset occlusion queries on the GPU, but for other query types
969 * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed
970 * in the queue.
971 */
972 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
973 cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery, queryCount);
974 } else {
975 cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool,
976 firstQuery, queryCount);
977 }
978 }
979
980 /**
981 * Creates a descriptor pool so we can create descriptors for the destination
982 * buffers of vkCmdCopyQueryPoolResults for queries where the copy is
983 * implemented on the GPU.
984 */
985 static VkResult
986 create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
987 {
988 /* If this is not the first pool we create for this command buffer,
989 * size it based on the size of the currently exhausted pool.
990 */
991 uint32_t descriptor_count = 32;
992 if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) {
993 struct v3dv_descriptor_pool *exhausted_pool =
994 v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool);
995 descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
996 }
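/* With the doubling above, successive pools for the same command buffer
 * grow roughly as 32, 64, 128, ... sets, capped at 1024.
 */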
997
998 /* Create the descriptor pool */
999 cmd_buffer->meta.query.dspool = VK_NULL_HANDLE;
1000 VkDescriptorPoolSize pool_size = {
1001 .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
1002 .descriptorCount = descriptor_count,
1003 };
1004 VkDescriptorPoolCreateInfo info = {
1005 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
1006 .maxSets = descriptor_count,
1007 .poolSizeCount = 1,
1008 .pPoolSizes = &pool_size,
1009 .flags = 0,
1010 };
1011 VkResult result =
1012 v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
1013 &info,
1014 &cmd_buffer->device->vk.alloc,
1015 &cmd_buffer->meta.query.dspool);
1016
1017 if (result == VK_SUCCESS) {
1018 assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
1019 const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool;
1020
1021 v3dv_cmd_buffer_add_private_obj(
1022 cmd_buffer, (uintptr_t) vk_pool,
1023 (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
1024
1025 struct v3dv_descriptor_pool *pool =
1026 v3dv_descriptor_pool_from_handle(vk_pool);
1027 pool->is_driver_internal = true;
1028 }
1029
1030 return result;
1031 }
1032
1033 static VkResult
1034 allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
1035 VkDescriptorSet *set)
1036 {
1037 /* Make sure we have a descriptor pool */
1038 VkResult result;
1039 if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) {
1040 result = create_storage_buffer_descriptor_pool(cmd_buffer);
1041 if (result != VK_SUCCESS)
1042 return result;
1043 }
1044 assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
1045
1046 /* Allocate descriptor set */
1047 struct v3dv_device *device = cmd_buffer->device;
1048 VkDevice vk_device = v3dv_device_to_handle(device);
1049 VkDescriptorSetAllocateInfo info = {
1050 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
1051 .descriptorPool = cmd_buffer->meta.query.dspool,
1052 .descriptorSetCount = 1,
1053 .pSetLayouts = &device->queries.buf_descriptor_set_layout,
1054 };
1055 result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
1056
1057 /* If we ran out of pool space, grow the pool and try again */
1058 if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
1059 result = create_storage_buffer_descriptor_pool(cmd_buffer);
1060 if (result == VK_SUCCESS) {
1061 info.descriptorPool = cmd_buffer->meta.query.dspool;
1062 result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
1063 }
1064 }
1065
1066 return result;
1067 }
1068
1069 static uint32_t
1070 copy_pipeline_index_from_flags(VkQueryResultFlags flags)
1071 {
1072 uint32_t index = 0;
1073 if (flags & VK_QUERY_RESULT_64_BIT)
1074 index |= 1;
1075 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
1076 index |= 2;
1077 if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
1078 index |= 4;
1079 assert(index < 8);
1080 return index;
1081 }
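/* The index packs the three result flags into bits 0-2; e.g.
 * VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT selects
 * pipeline variant 3, while no flags selects variant 0.
 */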
1082
1083 static nir_shader *
1084 get_copy_query_results_cs(const nir_shader_compiler_options *compiler_options,
1085 VkQueryResultFlags flags);
1086
1087 static void
1088 cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer,
1089 struct v3dv_query_pool *pool,
1090 uint32_t first, uint32_t count,
1091 struct v3dv_buffer *buf,
1092 uint32_t offset, uint32_t stride,
1093 VkQueryResultFlags flags)
1094 {
1095 struct v3dv_device *device = cmd_buffer->device;
1096 VkDevice vk_device = v3dv_device_to_handle(device);
1097 VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
1098
1099 /* Create the required copy pipeline if not yet created */
1100 uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags);
1101 if (!device->queries.copy_pipeline[pipeline_idx]) {
1102 const nir_shader_compiler_options *compiler_options =
1103 v3dv_pipeline_get_nir_options(&device->devinfo);
1104 nir_shader *copy_query_results_cs_nir =
1105 get_copy_query_results_cs(compiler_options, flags);
1106 VkResult result =
1107 v3dv_create_compute_pipeline_from_nir(
1108 device, copy_query_results_cs_nir,
1109 device->queries.copy_pipeline_layout,
1110 &device->queries.copy_pipeline[pipeline_idx]);
1111 ralloc_free(copy_query_results_cs_nir);
1112 if (result != VK_SUCCESS) {
1113 mesa_loge("Failed to create copy query results pipeline\n");
1114 return;
1115 }
1116 }
1117
1118 /* FIXME: do we need this barrier? Since vkCmdEndQuery should have been
1119 * called already, and that already waits, maybe we don't (this is
1120 * serialized in the compute queue with EndQuery anyway).
1121 */
1122 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1123 VkMemoryBarrier2 barrier = {
1124 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
1125 .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
1126 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
1127 };
1128 VkDependencyInfo barrier_info = {
1129 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
1130 .memoryBarrierCount = 1,
1131 .pMemoryBarriers = &barrier,
1132 };
1133 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
1134 }
1135
1136 /* Allocate and set up the descriptor set for the output buffer */
1137 VkDescriptorSet out_buf_descriptor_set;
1138 VkResult result =
1139 allocate_storage_buffer_descriptor_set(cmd_buffer,
1140 &out_buf_descriptor_set);
1141 if (result != VK_SUCCESS) {
1142 mesa_loge("vkCmdCopyQueryPoolResults failed: "
1143 "could not allocate descriptor.\n");
1144 return;
1145 }
1146
1147 VkDescriptorBufferInfo desc_buf_info = {
1148 .buffer = v3dv_buffer_to_handle(buf),
1149 .offset = 0,
1150 .range = VK_WHOLE_SIZE,
1151 };
1152 VkWriteDescriptorSet write = {
1153 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1154 .dstSet = out_buf_descriptor_set,
1155 .dstBinding = 0,
1156 .dstArrayElement = 0,
1157 .descriptorCount = 1,
1158 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1159 .pBufferInfo = &desc_buf_info,
1160 };
1161 v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
1162
1163 /* Dispatch copy */
1164 v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
1165
1166 assert(device->queries.copy_pipeline[pipeline_idx]);
1167 v3dv_CmdBindPipeline(vk_cmd_buffer,
1168 VK_PIPELINE_BIND_POINT_COMPUTE,
1169 device->queries.copy_pipeline[pipeline_idx]);
1170
1171 VkDescriptorSet sets[2] = {
1172 pool->meta.descriptor_set,
1173 out_buf_descriptor_set,
1174 };
1175 v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
1176 VK_PIPELINE_BIND_POINT_COMPUTE,
1177 device->queries.copy_pipeline_layout,
1178 0, 2, sets, 0, NULL);
1179
1180 struct {
1181 uint32_t avail_offset, first, offset, stride, flags;
1182 } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags };
1183 v3dv_CmdPushConstants(vk_cmd_buffer,
1184 device->queries.copy_pipeline_layout,
1185 VK_SHADER_STAGE_COMPUTE_BIT,
1186 0, sizeof(push_data), &push_data);
1187
1188 cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
1189
1190 v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
1191 }
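/* Note: the 'flags' value pushed above is not read back by the copy shader
 * in this file; the flag-dependent behavior is baked into the pipeline
 * variant selected by copy_pipeline_index_from_flags().
 */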
1192
1193 static void
1194 cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer *cmd_buffer,
1195 struct v3dv_query_pool *pool,
1196 uint32_t first,
1197 uint32_t count,
1198 struct v3dv_buffer *dst,
1199 uint32_t offset,
1200 uint32_t stride,
1201 VkQueryResultFlags flags)
1202 {
1203 struct v3dv_job *job =
1204 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
1205 V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
1206 cmd_buffer, -1);
1207 v3dv_return_if_oom(cmd_buffer, NULL);
1208
1209 job->cpu.query_copy_results.pool = pool;
1210 job->cpu.query_copy_results.first = first;
1211 job->cpu.query_copy_results.count = count;
1212 job->cpu.query_copy_results.dst = dst;
1213 job->cpu.query_copy_results.offset = offset;
1214 job->cpu.query_copy_results.stride = stride;
1215 job->cpu.query_copy_results.flags = flags;
1216
1217 list_addtail(&job->list_link, &cmd_buffer->jobs);
1218 }
1219
1220 VKAPI_ATTR void VKAPI_CALL
1221 v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
1222 VkQueryPool queryPool,
1223 uint32_t firstQuery,
1224 uint32_t queryCount,
1225 VkBuffer dstBuffer,
1226 VkDeviceSize dstOffset,
1227 VkDeviceSize stride,
1228 VkQueryResultFlags flags)
1229 {
1230 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1231 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1232 V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);
1233
1234 /* Copies can only happen outside a render pass instance so we should not
1235 * be in the middle of job recording.
1236 */
1237 assert(cmd_buffer->state.pass == NULL);
1238 assert(cmd_buffer->state.job == NULL);
1239
1240 assert(firstQuery < pool->query_count);
1241 assert(firstQuery + queryCount <= pool->query_count);
1242
1243 /* For occlusion queries we implement the copy on the GPU, but for other
1244 * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu
1245 * when executed in the queue.
1246 */
1247 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
1248 cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool,
1249 firstQuery, queryCount,
1250 dst, (uint32_t) dstOffset,
1251 (uint32_t) stride, flags);
1252 } else {
1253 cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool,
1254 firstQuery, queryCount,
1255 dst, (uint32_t)dstOffset,
1256 (uint32_t) stride, flags);
1257 }
1258 }
1259
1260 VKAPI_ATTR void VKAPI_CALL
1261 v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,
1262 VkQueryPool queryPool,
1263 uint32_t query,
1264 VkQueryControlFlags flags)
1265 {
1266 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1267 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1268
1269 v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags);
1270 }
1271
1272 VKAPI_ATTR void VKAPI_CALL
1273 v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
1274 VkQueryPool queryPool,
1275 uint32_t query)
1276 {
1277 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1278 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1279
1280 v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
1281 }
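/* Illustrative application-side sequence exercising the entry points above:
 *
 *   vkCmdResetQueryPool(cmd, pool, 0, 1);
 *   vkCmdBeginQuery(cmd, pool, 0, 0);
 *   ... draws whose samples should be counted ...
 *   vkCmdEndQuery(cmd, pool, 0);
 *   ... submit, then ...
 *   uint64_t result;
 *   vkGetQueryPoolResults(dev, pool, 0, 1, sizeof(result), &result,
 *                         sizeof(result),
 *                         VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
 */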
1282
1283 void
1284 v3dv_reset_query_pool_cpu(struct v3dv_device *device,
1285 struct v3dv_query_pool *pool,
1286 uint32_t first,
1287 uint32_t count)
1288 {
1289 mtx_lock(&device->query_mutex);
1290
1291 if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
1292 assert(first + count <= pool->query_count);
1293
1294 /* Reset timestamp */
1295 uint8_t *base_addr;
1296 base_addr = ((uint8_t *) pool->timestamp.bo->map) +
1297 pool->queries[first].timestamp.offset;
1298 memset(base_addr, 0, 8 * count);
1299
1300 for (uint32_t i = first; i < first + count; i++) {
1301 if (vk_sync_reset(&device->vk, pool->queries[i].timestamp.sync) != VK_SUCCESS)
1302 mesa_loge("Failed to reset sync");
1303 }
1304
1305 mtx_unlock(&device->query_mutex);
1306 return;
1307 }
1308
1309 for (uint32_t i = first; i < first + count; i++) {
1310 assert(i < pool->query_count);
1311 struct v3dv_query *q = &pool->queries[i];
1312 q->maybe_available = false;
1313 switch (pool->query_type) {
1314 case VK_QUERY_TYPE_OCCLUSION: {
1315 /* Reset availability */
1316 uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) +
1317 pool->occlusion.avail_offset + first;
1318 memset(base_addr, 0, count);
1319
1320 /* Reset occlusion counter */
1321 const uint8_t *q_addr =
1322 ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
1323 uint32_t *counter = (uint32_t *) q_addr;
1324 *counter = 0;
1325 break;
1326 }
1327 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1328 kperfmon_destroy(device, pool, i);
1329 kperfmon_create(device, pool, i);
1330 if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
1331 mesa_loge("Failed to reset sync");
1332 break;
1333 default:
1334 unreachable("Unsupported query type");
1335 }
1336 }
1337
1338 mtx_unlock(&device->query_mutex);
1339 }
1340
1341 VKAPI_ATTR void VKAPI_CALL
1342 v3dv_ResetQueryPool(VkDevice _device,
1343 VkQueryPool queryPool,
1344 uint32_t firstQuery,
1345 uint32_t queryCount)
1346 {
1347 V3DV_FROM_HANDLE(v3dv_device, device, _device);
1348 V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1349
1350 v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount);
1351 }
1352
1353 VKAPI_ATTR VkResult VKAPI_CALL
1354 v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1355 VkPhysicalDevice physicalDevice,
1356 uint32_t queueFamilyIndex,
1357 uint32_t *pCounterCount,
1358 VkPerformanceCounterKHR *pCounters,
1359 VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
1360 {
1361 V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);
1362
1363 uint32_t desc_count = *pCounterCount;
1364 uint8_t ncounters = pDevice->perfcntr->max_perfcnt;
1365
1366 VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
1367 out, pCounters, pCounterCount);
1368 VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
1369 out_desc, pCounterDescriptions, &desc_count);
1370
1371 for (int i = 0; i < ncounters; i++) {
1372 const struct v3d_perfcntr_desc *perfcntr_desc = v3d_perfcntrs_get_by_index(pDevice->perfcntr, i);
1373
1374 vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
1375 counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
1376 counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
1377 counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
1378
1379 unsigned char sha1_result[20];
1380 _mesa_sha1_compute(perfcntr_desc->name, strlen(perfcntr_desc->name), sha1_result);
1381
1382 memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
1383 }
1384
1385 vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
1386 &out_desc, desc) {
1387 desc->flags = 0;
1388 snprintf(desc->name, sizeof(desc->name), "%s", perfcntr_desc->name);
1389 snprintf(desc->category, sizeof(desc->category), "%s", perfcntr_desc->category);
1390 snprintf(desc->description, sizeof(desc->description), "%s", perfcntr_desc->description);
1391 }
1392 }
1393
1394 return vk_outarray_status(&out);
1395 }
1396
1397 VKAPI_ATTR void VKAPI_CALL
1398 v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1399 VkPhysicalDevice physicalDevice,
1400 const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
1401 uint32_t *pNumPasses)
1402 {
1403 *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
1404 DRM_V3D_MAX_PERF_COUNTERS);
1405 }
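/* E.g. with DRM_V3D_MAX_PERF_COUNTERS at 32, a performance query selecting
 * 70 counters reports 3 passes, matching the number of kperfmons the pool
 * creates for each query.
 */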
1406
1407 VKAPI_ATTR VkResult VKAPI_CALL
1408 v3dv_AcquireProfilingLockKHR(
1409 VkDevice _device,
1410 const VkAcquireProfilingLockInfoKHR *pInfo)
1411 {
1412 return VK_SUCCESS;
1413 }
1414
1415 VKAPI_ATTR void VKAPI_CALL
1416 v3dv_ReleaseProfilingLockKHR(VkDevice device)
1417 {
1418 }
1419
1420 static inline void
1421 nir_set_query_availability(nir_builder *b,
1422 nir_def *buf,
1423 nir_def *offset,
1424 nir_def *query_idx,
1425 nir_def *avail)
1426 {
1427 offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
1428 nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1);
1429 }
1430
1431 static inline nir_def *
1432 nir_get_query_availability(nir_builder *b,
1433 nir_def *buf,
1434 nir_def *offset,
1435 nir_def *query_idx)
1436 {
1437 offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
1438 nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1);
1439 return nir_i2i32(b, avail);
1440 }
1441
1442 static nir_shader *
1443 get_set_query_availability_cs(const nir_shader_compiler_options *options)
1444 {
1445 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1446 "set query availability cs");
1447
1448 nir_def *buf =
1449 nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1450 .desc_set = 0,
1451 .binding = 0,
1452 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1453
1454 /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1455 * ever change any of these parameters we need to update how we compute the
1456 * query index here.
1457 */
1458 nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1459
1460 nir_def *offset =
1461 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1462
1463 nir_def *query_idx =
1464 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1465
1466 nir_def *avail =
1467 nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1);
1468
1469 query_idx = nir_iadd(&b, query_idx, wg_id);
1470 nir_set_query_availability(&b, buf, offset, query_idx, avail);
1471
1472 return b.shader;
1473 }
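/* The push constant layout read above (offset at byte 0, base query index at
 * byte 4, availability byte at byte 8) must match the push_data struct used
 * in v3dv_cmd_buffer_emit_set_query_availability().
 */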
1474
1475 static inline nir_def *
1476 nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx)
1477 {
1478 nir_def *query_group = nir_udiv_imm(b, query_idx, 16);
1479 nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16);
1480 nir_def *offset =
1481 nir_iadd(b, nir_imul_imm(b, query_group, 1024),
1482 nir_imul_imm(b, query_group_offset, 4));
1483 return offset;
1484 }
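/* This mirrors the CPU-side layout from v3dv_CreateQueryPool: e.g. query 37
 * falls in group 37 / 16 = 2, slot 37 % 16 = 5, giving a byte offset of
 * 2 * 1024 + 5 * 4 = 2068.
 */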
1485
1486 static inline void
1487 nir_reset_occlusion_counter(nir_builder *b,
1488 nir_def *buf,
1489 nir_def *query_idx)
1490 {
1491 nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
1492 nir_def *zero = nir_imm_int(b, 0);
1493 nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4);
1494 }
1495
1496 static inline nir_def *
1497 nir_read_occlusion_counter(nir_builder *b,
1498 nir_def *buf,
1499 nir_def *query_idx)
1500 {
1501 nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
1502 return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4);
1503 }
1504
1505 static nir_shader *
1506 get_reset_occlusion_query_cs(const nir_shader_compiler_options *options)
1507 {
1508 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1509 "reset occlusion query cs");
1510
1511 nir_def *buf =
1512 nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1513 .desc_set = 0,
1514 .binding = 0,
1515 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1516
1517 /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1518 * ever change any of these parameters we need to update how we compute the
1519 * query index here.
1520 */
1521 nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1522
1523 nir_def *avail_offset =
1524 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1525
1526 nir_def *base_query_idx =
1527 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1528
1529 nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
1530
1531 nir_set_query_availability(&b, buf, avail_offset, query_idx,
1532 nir_imm_intN_t(&b, 0, 8));
1533 nir_reset_occlusion_counter(&b, buf, query_idx);
1534
1535 return b.shader;
1536 }
1537
1538 static void
1539 write_query_buffer(nir_builder *b,
1540 nir_def *buf,
1541 nir_def **offset,
1542 nir_def *value,
1543 bool flag_64bit)
1544 {
1545 if (flag_64bit) {
1546 /* Create a 64-bit value using a vec2 with the .Y component set to 0
1547 * so we can write a 64-bit value in a single store.
1548 */
1549 nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0));
1550 nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8);
1551 *offset = nir_iadd_imm(b, *offset, 8);
1552 } else {
1553 nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4);
1554 *offset = nir_iadd_imm(b, *offset, 4);
1555 }
1556 }
1557
1558 static nir_shader *
1559 get_copy_query_results_cs(const nir_shader_compiler_options *options,
1560 VkQueryResultFlags flags)
1561 {
1562 bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT;
1563 bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
1564 bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
1565
1566 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1567 "copy query results cs");
1568
1569 nir_def *buf =
1570 nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1571 .desc_set = 0,
1572 .binding = 0,
1573 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1574
1575 nir_def *buf_out =
1576 nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1577 .desc_set = 1,
1578 .binding = 0,
1579 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1580
1581 /* Read push constants */
1582 nir_def *avail_offset =
1583 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1584
1585 nir_def *base_query_idx =
1586 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1587
1588 nir_def *base_offset_out =
1589 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4);
1590
1591 nir_def *stride =
1592 nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4);
1593
1594 /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1595 * ever change any of these parameters we need to update how we compute the
1596 * query index here.
1597 */
1598 nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1599 nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
1600
1601 /* Read query availability if needed */
1602 nir_def *avail = NULL;
1603 if (flag_avail || !flag_partial)
1604 avail = nir_get_query_availability(&b, buf, avail_offset, query_idx);
1605
1606 /* Write occusion query result... */
1607 nir_def *offset =
1608 nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride));
1609
1610 /* ...if partial is requested, we always write */
1611 if(flag_partial) {
1612 nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
1613 write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
1614 } else {
1615 /*...otherwise, we only write if the query is available */
1616 nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0));
1617 nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
1618 write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
1619 nir_pop_if(&b, if_stmt);
1620 }
1621
1622 /* Write query availability */
1623 if (flag_avail)
1624 write_query_buffer(&b, buf_out, &offset, avail, flag_64bit);
1625
1626 return b.shader;
1627 }
1628
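/**
 * Creates the descriptor set layout, pipeline layouts and compute pipelines
 * shared by the GPU-side query operations. The copy pipelines themselves are
 * created lazily (see the note at the end of this function).
 */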
static bool
create_query_pipelines(struct v3dv_device *device)
{
   VkResult result;
   VkPipeline pipeline;

   /* Set layout: single storage buffer */
   if (!device->queries.buf_descriptor_set_layout) {
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      };
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &descriptor_set_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &descriptor_set_layout_info,
                                        &device->vk.alloc,
                                        &device->queries.buf_descriptor_set_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   /* Set availability pipeline.
    *
    * Pipeline layout:
    *  - 1 storage buffer for the BO with the query availability.
    *  - Push constants (9 bytes):
    *    0B: offset of the availability info in the buffer (4 bytes)
    *    4B: base query index (4 bytes)
    *    8B: availability value (1 byte)
    */
   if (!device->queries.avail_pipeline_layout) {
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.avail_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   const nir_shader_compiler_options *compiler_options =
      v3dv_pipeline_get_nir_options(&device->devinfo);

   if (!device->queries.avail_pipeline) {
      nir_shader *set_query_availability_cs_nir =
         get_set_query_availability_cs(compiler_options);
      result = v3dv_create_compute_pipeline_from_nir(device,
                                                     set_query_availability_cs_nir,
                                                     device->queries.avail_pipeline_layout,
                                                     &pipeline);
      ralloc_free(set_query_availability_cs_nir);
      if (result != VK_SUCCESS)
         return false;

      device->queries.avail_pipeline = pipeline;
   }

   /* Reset occlusion query pipeline.
    *
    * Pipeline layout:
    *  - 1 storage buffer for the BO with the occlusion and availability data.
    *  - Push constants:
    *    0B: offset of the availability info in the buffer (4B)
    *    4B: base query index (4B)
    */
   if (!device->queries.reset_occlusion_pipeline_layout) {
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.reset_occlusion_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   if (!device->queries.reset_occlusion_pipeline) {
      nir_shader *reset_occlusion_query_cs_nir =
         get_reset_occlusion_query_cs(compiler_options);
      result = v3dv_create_compute_pipeline_from_nir(
         device,
         reset_occlusion_query_cs_nir,
         device->queries.reset_occlusion_pipeline_layout,
         &pipeline);
      ralloc_free(reset_occlusion_query_cs_nir);
      if (result != VK_SUCCESS)
         return false;

      device->queries.reset_occlusion_pipeline = pipeline;
   }

   /* Copy query results pipelines.
    *
    * Pipeline layout:
    *  - 1 storage buffer for the BO with the query availability and occlusion.
    *  - 1 storage buffer for the output.
    *  - Push constants:
    *    0B: offset of the availability info in the buffer (4B)
    *    4B: base query index (4B)
    *    8B: offset into output buffer (4B)
    *    12B: stride (4B)
    *
    * We create multiple specialized pipelines depending on the copy flags
    * to remove conditionals from the copy shader and get more optimized
    * pipelines.
    */
   if (!device->queries.copy_pipeline_layout) {
      VkDescriptorSetLayout set_layouts[2] = {
         device->queries.buf_descriptor_set_layout,
         device->queries.buf_descriptor_set_layout
      };
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 2,
         .pSetLayouts = set_layouts,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.copy_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   /* The actual copy pipelines are created lazily on demand, since there can
    * be up to 8 depending on the flags used; however, applications are likely
    * to use the same flags every time, so typically only one is required.
    */

   return true;
}

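/**
 * Destroys the query pipelines, pipeline layouts and descriptor set layout
 * created by create_query_pipelines(), including any lazily created copy
 * pipelines.
 */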
static void
destroy_query_pipelines(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   /* Availability pipeline */
   v3dv_DestroyPipeline(_device, device->queries.avail_pipeline,
                        &device->vk.alloc);
   device->queries.avail_pipeline = VK_NULL_HANDLE;
   v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout,
                              &device->vk.alloc);
   device->queries.avail_pipeline_layout = VK_NULL_HANDLE;

   /* Reset occlusion pipeline */
   v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline,
                        &device->vk.alloc);
   device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE;
   v3dv_DestroyPipelineLayout(_device,
                              device->queries.reset_occlusion_pipeline_layout,
                              &device->vk.alloc);
   device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE;

   /* Copy pipelines */
   for (int i = 0; i < 8; i++) {
      v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i],
                           &device->vk.alloc);
      device->queries.copy_pipeline[i] = VK_NULL_HANDLE;
   }
   v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout,
                              &device->vk.alloc);
   device->queries.copy_pipeline_layout = VK_NULL_HANDLE;

   v3dv_DestroyDescriptorSetLayout(_device,
                                   device->queries.buf_descriptor_set_layout,
                                   &device->vk.alloc);
   device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE;
}

/**
 * Allocates device resources for implementing certain types of queries.
 */
VkResult
v3dv_query_allocate_resources(struct v3dv_device *device)
{
   if (!create_query_pipelines(device))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   return VK_SUCCESS;
}

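/**
 * Frees the device resources allocated by v3dv_query_allocate_resources().
 */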
void
v3dv_query_free_resources(struct v3dv_device *device)
{
   destroy_query_pipelines(device);
}