1 /*
2  * Copyright © 2020 Raspberry Pi Ltd
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 
26 #include "util/timespec.h"
27 #include "compiler/nir/nir_builder.h"
28 
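/* Each kernel perfmon can hold at most DRM_V3D_MAX_PERF_COUNTERS counters, so
 * a single performance query may need several perfmons; the pool's counter
 * list is split in order across pool->perfmon.nperfmons perfmons.
 */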
29 static void
30 kperfmon_create(struct v3dv_device *device,
31                 struct v3dv_query_pool *pool,
32                 uint32_t query)
33 {
34    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
35       assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
36 
37       struct drm_v3d_perfmon_create req = {
38          .ncounters = MIN2(pool->perfmon.ncounters -
39                            i * DRM_V3D_MAX_PERF_COUNTERS,
40                            DRM_V3D_MAX_PERF_COUNTERS),
41       };
42       memcpy(req.counters,
43              &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
44              req.ncounters);
45 
46       int ret = v3d_ioctl(device->pdevice->render_fd,
47                           DRM_IOCTL_V3D_PERFMON_CREATE,
48                           &req);
49       if (ret)
50          mesa_loge("Failed to create perfmon for query %u: %s\n", query,
51                    strerror(errno));
52 
53       pool->queries[query].perf.kperfmon_ids[i] = req.id;
54    }
55 }
56 
57 static void
58 kperfmon_destroy(struct v3dv_device *device,
59                  struct v3dv_query_pool *pool,
60                  uint32_t query)
61 {
62    /* Skip destroying if never created */
63    if (!pool->queries[query].perf.kperfmon_ids[0])
64       return;
65 
66    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
67       struct drm_v3d_perfmon_destroy req = {
68          .id = pool->queries[query].perf.kperfmon_ids[i]
69       };
70 
71       int ret = v3d_ioctl(device->pdevice->render_fd,
72                           DRM_IOCTL_V3D_PERFMON_DESTROY,
73                           &req);
74 
75       if (ret) {
76          mesa_loge("Failed to destroy perfmon %u: %s\n",
77                    req.id, strerror(errno));
78       }
79    }
80 }
81 
82 /**
83  * Creates a VkBuffer (and VkDeviceMemory) to access a BO.
84  */
85 static VkResult
86 create_vk_storage_buffer(struct v3dv_device *device,
87                          struct v3dv_bo *bo,
88                          VkBuffer *vk_buf,
89                          VkDeviceMemory *vk_mem)
90 {
91    VkDevice vk_device = v3dv_device_to_handle(device);
92 
93    VkBufferCreateInfo buf_info = {
94       .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
95       .size = bo->size,
96       .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
97    };
98    VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf);
99    if (result != VK_SUCCESS)
100       return result;
101 
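   /* Wrap the existing BO in a device memory object rather than allocating new
    * memory, so the VkBuffer aliases the query pool's storage directly.
    */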
102    struct v3dv_device_memory *mem =
103       vk_object_zalloc(&device->vk, NULL, sizeof(*mem),
104                        VK_OBJECT_TYPE_DEVICE_MEMORY);
105    if (!mem)
106       return VK_ERROR_OUT_OF_HOST_MEMORY;
107 
108    mem->bo = bo;
109    mem->type = &device->pdevice->memory.memoryTypes[0];
110 
111    *vk_mem = v3dv_device_memory_to_handle(mem);
112    VkBindBufferMemoryInfo bind_info = {
113       .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
114       .buffer = *vk_buf,
115       .memory = *vk_mem,
116       .memoryOffset = 0,
117    };
118    v3dv_BindBufferMemory2(vk_device, 1, &bind_info);
119 
120    return VK_SUCCESS;
121 }
122 
123 static void
124 destroy_vk_storage_buffer(struct v3dv_device *device,
125                           VkBuffer *vk_buf,
126                           VkDeviceMemory *vk_mem)
127 {
128    if (*vk_mem) {
129       vk_object_free(&device->vk, NULL, v3dv_device_memory_from_handle(*vk_mem));
130       *vk_mem = VK_NULL_HANDLE;
131    }
132 
133    v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL);
134    *vk_buf = VK_NULL_HANDLE;
135 }
136 
137 /**
138  * Allocates descriptor sets to access the query pool BO (availability and
139  * occlusion query results) from Vulkan pipelines.
140  */
141 static VkResult
142 create_pool_descriptors(struct v3dv_device *device,
143                         struct v3dv_query_pool *pool)
144 {
145    assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
146    VkDevice vk_device = v3dv_device_to_handle(device);
147 
148    VkDescriptorPoolSize pool_size = {
149       .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
150       .descriptorCount = 1,
151    };
152    VkDescriptorPoolCreateInfo pool_info = {
153       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
154       .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
155       .maxSets = 1,
156       .poolSizeCount = 1,
157       .pPoolSizes = &pool_size,
158    };
159    VkResult result =
160       v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL,
161                                 &pool->meta.descriptor_pool);
162 
163    if (result != VK_SUCCESS)
164       return result;
165 
166    VkDescriptorSetAllocateInfo alloc_info = {
167       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
168       .descriptorPool = pool->meta.descriptor_pool,
169       .descriptorSetCount = 1,
170       .pSetLayouts = &device->queries.buf_descriptor_set_layout,
171    };
172    result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info,
173                                         &pool->meta.descriptor_set);
174    if (result != VK_SUCCESS)
175       return result;
176 
177    VkDescriptorBufferInfo desc_buf_info = {
178       .buffer = pool->meta.buf,
179       .offset = 0,
180       .range = VK_WHOLE_SIZE,
181    };
182 
183    VkWriteDescriptorSet write = {
184       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
185       .dstSet = pool->meta.descriptor_set,
186       .dstBinding = 0,
187       .dstArrayElement = 0,
188       .descriptorCount = 1,
189       .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
190       .pBufferInfo = &desc_buf_info,
191    };
192    v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
193 
194    return VK_SUCCESS;
195 }
196 
197 static void
198 destroy_pool_descriptors(struct v3dv_device *device,
199                          struct v3dv_query_pool *pool)
200 {
201    assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
202 
203    v3dv_FreeDescriptorSets(v3dv_device_to_handle(device),
204                            pool->meta.descriptor_pool,
205                            1, &pool->meta.descriptor_set);
206    pool->meta.descriptor_set = VK_NULL_HANDLE;
207 
208    v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device),
209                               pool->meta.descriptor_pool, NULL);
210    pool->meta.descriptor_pool = VK_NULL_HANDLE;
211 }
212 
213 static VkResult
214 pool_create_meta_resources(struct v3dv_device *device,
215                            struct v3dv_query_pool *pool)
216 {
217    VkResult result;
218 
219    if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
220       return VK_SUCCESS;
221 
222    result = create_vk_storage_buffer(device, pool->occlusion.bo,
223                                      &pool->meta.buf, &pool->meta.mem);
224    if (result != VK_SUCCESS)
225       return result;
226 
227    result = create_pool_descriptors(device, pool);
228    if (result != VK_SUCCESS)
229        return result;
230 
231    return VK_SUCCESS;
232 }
233 
234 static void
235 pool_destroy_meta_resources(struct v3dv_device *device,
236                             struct v3dv_query_pool *pool)
237 {
238    if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
239       return;
240 
241    destroy_pool_descriptors(device, pool);
242    destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem);
243 }
244 
245 VKAPI_ATTR VkResult VKAPI_CALL
246 v3dv_CreateQueryPool(VkDevice _device,
247                      const VkQueryPoolCreateInfo *pCreateInfo,
248                      const VkAllocationCallbacks *pAllocator,
249                      VkQueryPool *pQueryPool)
250 {
251    V3DV_FROM_HANDLE(v3dv_device, device, _device);
252 
253    assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
254           pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
255           pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
256    assert(pCreateInfo->queryCount > 0);
257 
258    struct v3dv_query_pool *pool =
259       vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
260                        VK_OBJECT_TYPE_QUERY_POOL);
261    if (pool == NULL)
262       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
263 
264    pool->query_type = pCreateInfo->queryType;
265    pool->query_count = pCreateInfo->queryCount;
266 
267    uint32_t query_idx = 0;
268    VkResult result;
269 
270    const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
271    pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
272                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
273    if (pool->queries == NULL) {
274       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
275       goto fail;
276    }
277 
278    switch (pool->query_type) {
279    case VK_QUERY_TYPE_OCCLUSION: {
280       /* The hardware allows us to set up groups of 16 queries at consecutive
281        * 4-byte addresses, requiring only that each group of 16 queries is
282        * aligned to a 1024-byte boundary.
283        */
284       const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
285       uint32_t bo_size = query_groups * 1024;
286       /* After the counters we store availability data, 1 byte per query. */
287       pool->occlusion.avail_offset = bo_size;
288       bo_size += pool->query_count;
289       pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true);
290       if (!pool->occlusion.bo) {
291          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
292          goto fail;
293       }
294       if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) {
295          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
296          goto fail;
297       }
298       break;
299    }
300    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
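      /* Record the requested counter indices and split them across as many
       * kernel perfmons as needed, since each perfmon holds at most
       * DRM_V3D_MAX_PERF_COUNTERS counters.
       */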
301       const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
302          vk_find_struct_const(pCreateInfo->pNext,
303                               QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
304 
305       assert(pq_info);
306 
307       pool->perfmon.ncounters = pq_info->counterIndexCount;
308       for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
309          pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
310 
311       pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
312                                              DRM_V3D_MAX_PERF_COUNTERS);
313 
314       assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
315       break;
316    }
317    case VK_QUERY_TYPE_TIMESTAMP: {
318       /* 8 bytes per query are used for the timestamp value. All timestamps
319        * are tightly packed at the start of the buffer.
320        */
321       const uint32_t bo_size = pool->query_count * 8;
322       pool->timestamp.bo = v3dv_bo_alloc(device, bo_size, "query:t", true);
323       if (!pool->timestamp.bo) {
324          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
325          goto fail;
326       }
327       if (!v3dv_bo_map(device, pool->timestamp.bo, bo_size)) {
328          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
329          goto fail;
330       }
331       break;
332    }
333    default:
334       unreachable("Unsupported query type");
335    }
336 
337    /* Initialize queries in the pool */
338    for (; query_idx < pool->query_count; query_idx++) {
339       pool->queries[query_idx].maybe_available = false;
340       switch (pool->query_type) {
341       case VK_QUERY_TYPE_OCCLUSION: {
342          const uint32_t query_group = query_idx / 16;
343          const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
344          pool->queries[query_idx].occlusion.offset = query_offset;
345          break;
346          }
347       case VK_QUERY_TYPE_TIMESTAMP:
348          pool->queries[query_idx].timestamp.offset = query_idx * 8;
349          result = vk_sync_create(&device->vk,
350                                  &device->pdevice->drm_syncobj_type, 0, 0,
351                                  &pool->queries[query_idx].timestamp.sync);
352          if (result != VK_SUCCESS)
353             goto fail;
354          break;
355       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
356          result = vk_sync_create(&device->vk,
357                                  &device->pdevice->drm_syncobj_type, 0, 0,
358                                  &pool->queries[query_idx].perf.last_job_sync);
359          if (result != VK_SUCCESS)
360             goto fail;
361 
362          kperfmon_create(device, pool, query_idx);
363          break;
364          }
365       default:
366          unreachable("Unsupported query type");
367       }
368    }
369 
370    /* Create meta resources */
371    result = pool_create_meta_resources(device, pool);
372    if (result != VK_SUCCESS)
373       goto fail;
374 
375    *pQueryPool = v3dv_query_pool_to_handle(pool);
376 
377    return VK_SUCCESS;
378 
379 fail:
380    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
381       for (uint32_t j = 0; j < query_idx; j++)
382          vk_sync_destroy(&device->vk, pool->queries[j].timestamp.sync);
383    }
384 
385    if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
386       for (uint32_t j = 0; j < query_idx; j++)
387          vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
388    }
389 
390    if (pool->occlusion.bo)
391       v3dv_bo_free(device, pool->occlusion.bo);
392    if (pool->timestamp.bo)
393       v3dv_bo_free(device, pool->timestamp.bo);
394    if (pool->queries)
395       vk_free2(&device->vk.alloc, pAllocator, pool->queries);
396    pool_destroy_meta_resources(device, pool);
397    vk_object_free(&device->vk, pAllocator, pool);
398 
399    return result;
400 }
401 
402 VKAPI_ATTR void VKAPI_CALL
403 v3dv_DestroyQueryPool(VkDevice _device,
404                       VkQueryPool queryPool,
405                       const VkAllocationCallbacks *pAllocator)
406 {
407    V3DV_FROM_HANDLE(v3dv_device, device, _device);
408    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
409 
410    if (!pool)
411       return;
412 
413    if (pool->occlusion.bo)
414       v3dv_bo_free(device, pool->occlusion.bo);
415 
416    if (pool->timestamp.bo)
417       v3dv_bo_free(device, pool->timestamp.bo);
418 
419    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
420       for (uint32_t i = 0; i < pool->query_count; i++)
421          vk_sync_destroy(&device->vk, pool->queries[i].timestamp.sync);
422    }
423 
424    if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
425       for (uint32_t i = 0; i < pool->query_count; i++) {
426          kperfmon_destroy(device, pool, i);
427          vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
428       }
429    }
430 
431    if (pool->queries)
432       vk_free2(&device->vk.alloc, pAllocator, pool->queries);
433 
434    pool_destroy_meta_resources(device, pool);
435 
436    vk_object_free(&device->vk, pAllocator, pool);
437 }
438 
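/* Writes a single result value into the output buffer at the given slot, as a
 * 64-bit or 32-bit word depending on the requested result size.
 */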
439 static void
440 write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
441 {
442    if (do_64bit) {
443       uint64_t *dst64 = (uint64_t *) dst;
444       dst64[idx] = value;
445    } else {
446       uint32_t *dst32 = (uint32_t *) dst;
447       dst32[idx] = (uint32_t) value;
448    }
449 }
450 
451 static VkResult
452 query_wait_available(struct v3dv_device *device,
453                      struct v3dv_query_pool *pool,
454                      struct v3dv_query *q,
455                      uint32_t query_idx)
456 {
457    /* For occlusion queries we prefer to poll the availability BO in a loop
458     * rather than waiting on the query results BO, because the latter would
459     * make us wait for any job running queries from the pool, even if those
460     * jobs do not involve the query we want to wait on.
461     */
462    if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
463       uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
464                         pool->occlusion.avail_offset + query_idx;
465       while (*q_addr == 0)
466          usleep(250);
467       return VK_SUCCESS;
468    }
469 
470    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
471       if (vk_sync_wait(&device->vk, q->timestamp.sync,
472                        0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
473          return vk_device_set_lost(&device->vk, "Query job wait failed");
474       }
475       return VK_SUCCESS;
476    }
477 
478    assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
479 
480    /* For performance queries we need to wait for the queue to signal that
481     * the query has been submitted for execution before anything else.
482     */
483    VkResult result = VK_SUCCESS;
484    if (!q->maybe_available) {
485       struct timespec timeout;
486       timespec_get(&timeout, TIME_UTC);
487       timespec_add_msec(&timeout, &timeout, 2000);
488 
489       mtx_lock(&device->query_mutex);
490       while (!q->maybe_available) {
491          if (vk_device_is_lost(&device->vk)) {
492             result = VK_ERROR_DEVICE_LOST;
493             break;
494          }
495 
496          int ret = cnd_timedwait(&device->query_ended,
497                                  &device->query_mutex,
498                                  &timeout);
499          if (ret != thrd_success) {
500             mtx_unlock(&device->query_mutex);
501             result = vk_device_set_lost(&device->vk, "Query wait failed");
502             break;
503          }
504       }
505       mtx_unlock(&device->query_mutex);
506 
507       if (result != VK_SUCCESS)
508          return result;
509 
510       /* For performance queries, we also need to wait for the relevant syncobj
511        * to be signaled to ensure completion of the GPU work.
512        */
513       if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
514           vk_sync_wait(&device->vk, q->perf.last_job_sync,
515                        0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
516         return vk_device_set_lost(&device->vk, "Query job wait failed");
517       }
518    }
519 
520    return result;
521 }
522 
523 static VkResult
524 query_check_available(struct v3dv_device *device,
525                       struct v3dv_query_pool *pool,
526                       struct v3dv_query *q,
527                       uint32_t query_idx)
528 {
529    /* For occlusion we check the availability BO */
530    if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
531       const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
532                               pool->occlusion.avail_offset + query_idx;
533       return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY;
534    }
535 
536    /* For timestamp queries, we need to check if the relevant job
537     * has completed.
538     */
539    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
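      /* A zero timeout makes the sync wait return immediately, so this is a
       * non-blocking check of whether the timestamp job has completed.
       */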
540       if (vk_sync_wait(&device->vk, q->timestamp.sync,
541                        0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
542          return VK_NOT_READY;
543       }
544       return VK_SUCCESS;
545    }
546 
547    /* For other queries we need to check if the queue has submitted the query
548     * for execution at all.
549     */
550    assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
551    if (!q->maybe_available)
552       return VK_NOT_READY;
553 
554    /* For performance queries, we also need to check if the relevant GPU job
555     * has completed.
556     */
557    if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
558        vk_sync_wait(&device->vk, q->perf.last_job_sync,
559                     0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
560          return VK_NOT_READY;
561    }
562 
563    return VK_SUCCESS;
564 }
565 
566 static VkResult
567 query_is_available(struct v3dv_device *device,
568                    struct v3dv_query_pool *pool,
569                    uint32_t query,
570                    bool do_wait,
571                    bool *available)
572 {
573    struct v3dv_query *q = &pool->queries[query];
574 
575    if (do_wait) {
576       VkResult result = query_wait_available(device, pool, q, query);
577       if (result != VK_SUCCESS) {
578          *available = false;
579          return result;
580       }
581 
582       *available = true;
583    } else {
584       VkResult result = query_check_available(device, pool, q, query);
585       assert(result == VK_SUCCESS || result == VK_NOT_READY);
586       *available = (result == VK_SUCCESS);
587    }
588 
589    return VK_SUCCESS;
590 }
591 
592 static VkResult
593 write_occlusion_query_result(struct v3dv_device *device,
594                              struct v3dv_query_pool *pool,
595                              uint32_t query,
596                              bool do_64bit,
597                              void *data,
598                              uint32_t slot)
599 {
600    assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
601 
602    if (vk_device_is_lost(&device->vk))
603       return VK_ERROR_DEVICE_LOST;
604 
605    struct v3dv_query *q = &pool->queries[query];
606    assert(pool->occlusion.bo && pool->occlusion.bo->map);
607 
608    const uint8_t *query_addr =
609       ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
610    write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
611    return VK_SUCCESS;
612 }
613 
614 static VkResult
615 write_timestamp_query_result(struct v3dv_device *device,
616                              struct v3dv_query_pool *pool,
617                              uint32_t query,
618                              bool do_64bit,
619                              void *data,
620                              uint32_t slot)
621 {
622    assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
623 
624    struct v3dv_query *q = &pool->queries[query];
625 
626    const uint8_t *query_addr =
627       ((uint8_t *) pool->timestamp.bo->map) + q->timestamp.offset;
628 
629    write_to_buffer(data, slot, do_64bit, *((uint64_t *)query_addr));
630    return VK_SUCCESS;
631 }
632 
633 static VkResult
634 write_performance_query_result(struct v3dv_device *device,
635                                struct v3dv_query_pool *pool,
636                                uint32_t query,
637                                bool do_64bit,
638                                void *data,
639                                uint32_t slot)
640 {
641    assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
642 
643    struct v3dv_query *q = &pool->queries[query];
644    uint64_t counter_values[V3D_MAX_PERFCNT];
645 
646    assert(pool->perfmon.nperfmons);
647    assert(pool->perfmon.ncounters);
648 
649    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
650       struct drm_v3d_perfmon_get_values req = {
651          .id = q->perf.kperfmon_ids[i],
652          .values_ptr = (uintptr_t)(&counter_values[i *
653                                    DRM_V3D_MAX_PERF_COUNTERS])
654       };
655 
656       int ret = v3d_ioctl(device->pdevice->render_fd,
657                           DRM_IOCTL_V3D_PERFMON_GET_VALUES,
658                           &req);
659 
660       if (ret) {
661          mesa_loge("failed to get perfmon values: %s\n", strerror(errno));
662          return vk_error(device, VK_ERROR_DEVICE_LOST);
663       }
664    }
665 
666    for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
667       write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
668 
669    return VK_SUCCESS;
670 }
671 
672 static VkResult
673 write_query_result(struct v3dv_device *device,
674                    struct v3dv_query_pool *pool,
675                    uint32_t query,
676                    bool do_64bit,
677                    void *data,
678                    uint32_t slot)
679 {
680    switch (pool->query_type) {
681    case VK_QUERY_TYPE_OCCLUSION:
682       return write_occlusion_query_result(device, pool, query, do_64bit,
683                                           data, slot);
684    case VK_QUERY_TYPE_TIMESTAMP:
685       return write_timestamp_query_result(device, pool, query, do_64bit,
686                                           data, slot);
687    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
688       return write_performance_query_result(device, pool, query, do_64bit,
689                                             data, slot);
690    default:
691       unreachable("Unsupported query type");
692    }
693 }
694 
695 static uint32_t
696 get_query_result_count(struct v3dv_query_pool *pool)
697 {
698    switch (pool->query_type) {
699    case VK_QUERY_TYPE_OCCLUSION:
700    case VK_QUERY_TYPE_TIMESTAMP:
701       return 1;
702    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
703       return pool->perfmon.ncounters;
704    default:
705       unreachable("Unsupported query type");
706    }
707 }
708 
709 VkResult
710 v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
711                                 struct v3dv_query_pool *pool,
712                                 uint32_t first,
713                                 uint32_t count,
714                                 void *data,
715                                 VkDeviceSize stride,
716                                 VkQueryResultFlags flags)
717 {
718    assert(first < pool->query_count);
719    assert(first + count <= pool->query_count);
720    assert(data);
721 
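   /* Performance query results are always written as 64-bit values, matching
    * the UINT64 counter storage advertised in the counter enumeration below.
    */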
722    const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
723       pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
724    const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
725    const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
726 
727    uint32_t result_count = get_query_result_count(pool);
728 
729    VkResult result = VK_SUCCESS;
730    for (uint32_t i = first; i < first + count; i++) {
731       bool available = false;
732       VkResult query_result =
733          query_is_available(device, pool, i, do_wait, &available);
734       if (query_result == VK_ERROR_DEVICE_LOST)
735          result = VK_ERROR_DEVICE_LOST;
736 
737       /**
738        * From the Vulkan 1.0 spec:
739        *
740        *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
741        *     both not set then no result values are written to pData for queries
742        *     that are in the unavailable state at the time of the call, and
743        *     vkGetQueryPoolResults returns VK_NOT_READY. However, availability
744        *     state is still written to pData for those queries if
745        *     VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
746        */
747       uint32_t slot = 0;
748 
749       const bool write_result = available || do_partial;
750       if (write_result)
751          write_query_result(device, pool, i, do_64bit, data, slot);
752       slot += result_count;
753 
754       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
755          write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
756 
757       if (!write_result && result != VK_ERROR_DEVICE_LOST)
758          result = VK_NOT_READY;
759 
760       data += stride;
761    }
762 
763    return result;
764 }
765 
766 VKAPI_ATTR VkResult VKAPI_CALL
767 v3dv_GetQueryPoolResults(VkDevice _device,
768                          VkQueryPool queryPool,
769                          uint32_t firstQuery,
770                          uint32_t queryCount,
771                          size_t dataSize,
772                          void *pData,
773                          VkDeviceSize stride,
774                          VkQueryResultFlags flags)
775 {
776    V3DV_FROM_HANDLE(v3dv_device, device, _device);
777    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
778 
779    if (vk_device_is_lost(&device->vk))
780       return VK_ERROR_DEVICE_LOST;
781 
782    return v3dv_get_query_pool_results_cpu(device, pool, firstQuery, queryCount,
783                                           pData, stride, flags);
784 }
785 
786 /* Emits a series of vkCmdDispatchBase calls to execute all the workgroups
787  * required to handle a number of queries, given the per-dispatch limit.
788  */
789 static void
790 cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer,
791                                  uint32_t query_count)
792 {
793    VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
794 
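   /* Split the work into batches of at most 65535 workgroups per dispatch,
    * using the base workgroup X index to encode the first query of each batch.
    */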
795    uint32_t dispatched = 0;
796    const uint32_t max_batch_size = 65535;
797    while (dispatched < query_count) {
798       uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size);
799       v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1);
800       dispatched += batch_size;
801    }
802 }
803 
804 void
805 v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer,
806                                             struct v3dv_query_pool *pool,
807                                             uint32_t query, uint32_t count,
808                                             uint8_t availability)
809 {
810    assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
811           pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
812 
813    struct v3dv_device *device = cmd_buffer->device;
814    VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
815 
816    /* We are about to emit a compute job to set query availability and we need
817     * to ensure this executes after the graphics work using the queries has
818     * completed.
819     */
820    VkMemoryBarrier2 barrier = {
821       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
822       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
823       .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
824    };
825    VkDependencyInfo barrier_info = {
826       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
827       .memoryBarrierCount = 1,
828       .pMemoryBarriers = &barrier,
829    };
830    v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
831 
832    /* Dispatch queries */
833    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
834 
835    v3dv_CmdBindPipeline(vk_cmd_buffer,
836                         VK_PIPELINE_BIND_POINT_COMPUTE,
837                         device->queries.avail_pipeline);
838 
839    v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
840                               VK_PIPELINE_BIND_POINT_COMPUTE,
841                               device->queries.avail_pipeline_layout,
842                               0, 1, &pool->meta.descriptor_set,
843                               0, NULL);
844 
845    struct {
846       uint32_t offset;
847       uint32_t query;
848       uint8_t availability;
849    } push_data = { pool->occlusion.avail_offset, query, availability };
850    v3dv_CmdPushConstants(vk_cmd_buffer,
851                          device->queries.avail_pipeline_layout,
852                          VK_SHADER_STAGE_COMPUTE_BIT,
853                          0, sizeof(push_data), &push_data);
854    cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
855 
856    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
857 }
858 
859 static void
860 cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
861                                            struct v3dv_query_pool *pool,
862                                            uint32_t query, uint32_t count)
863 {
864    struct v3dv_device *device = cmd_buffer->device;
865    VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
866 
867    /* Ensure the GPU is done with the queries in the graphics queue before
868     * we reset them in the compute queue.
869     */
870    VkMemoryBarrier2 barrier = {
871       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
872       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
873       .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
874    };
875    VkDependencyInfo barrier_info = {
876       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
877       .memoryBarrierCount = 1,
878       .pMemoryBarriers = &barrier,
879    };
880    v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
881 
882    /* Emit compute reset */
883    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
884 
885    v3dv_CmdBindPipeline(vk_cmd_buffer,
886                         VK_PIPELINE_BIND_POINT_COMPUTE,
887                         device->queries.reset_occlusion_pipeline);
888 
889    v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
890                               VK_PIPELINE_BIND_POINT_COMPUTE,
891                               device->queries.reset_occlusion_pipeline_layout,
892                               0, 1, &pool->meta.descriptor_set,
893                               0, NULL);
894    struct {
895       uint32_t offset;
896       uint32_t query;
897    } push_data = { pool->occlusion.avail_offset, query };
898    v3dv_CmdPushConstants(vk_cmd_buffer,
899                          device->queries.reset_occlusion_pipeline_layout,
900                          VK_SHADER_STAGE_COMPUTE_BIT,
901                          0, sizeof(push_data), &push_data);
902 
903    cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
904 
905    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
906 
907    /* Ensure future work in the graphics queue using the queries doesn't start
908     * before the reset has completed.
909     */
910    barrier = (VkMemoryBarrier2) {
911       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
912       .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
913       .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT,
914    };
915    barrier_info = (VkDependencyInfo) {
916       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
917       .memoryBarrierCount = 1,
918       .pMemoryBarriers = &barrier,
919    };
920    v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
921 }
922 
923 static void
924 cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
925                                  struct v3dv_query_pool *pool,
926                                  uint32_t first, uint32_t count)
927 {
928    assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
929    cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count);
930 }
931 
932 static void
933 cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer,
934                                      struct v3dv_query_pool *pool,
935                                      uint32_t first, uint32_t count)
936 {
937    assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION);
938 
939    struct v3dv_job *job =
940       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
941                                      V3DV_JOB_TYPE_CPU_RESET_QUERIES,
942                                      cmd_buffer, -1);
943    v3dv_return_if_oom(cmd_buffer, NULL);
944    job->cpu.query_reset.pool = pool;
945    job->cpu.query_reset.first = first;
946    job->cpu.query_reset.count = count;
947    list_addtail(&job->list_link, &cmd_buffer->jobs);
948 }
949 
950 VKAPI_ATTR void VKAPI_CALL
951 v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
952                        VkQueryPool queryPool,
953                        uint32_t firstQuery,
954                        uint32_t queryCount)
955 {
956    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
957    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
958 
959    /* Resets can only happen outside a render pass instance so we should not
960     * be in the middle of job recording.
961     */
962    assert(cmd_buffer->state.pass == NULL);
963    assert(cmd_buffer->state.job == NULL);
964 
965    assert(firstQuery < pool->query_count);
966    assert(firstQuery + queryCount <= pool->query_count);
967 
968    /* We can reset occlusion queries in the GPU, but for other query types
969     * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed
970     * in the queue.
971     */
972    if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
973       cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery, queryCount);
974    } else {
975       cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool,
976                                            firstQuery, queryCount);
977    }
978 }
979 
980 /**
981  * Creates a descriptor pool so we can create descriptors for the destination
982  * buffers of vkCmdCopyQueryPoolResults for queries where the copy is
983  * implemented on the GPU.
984  */
985 static VkResult
986 create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
987 {
988    /* If this is not the first pool we create for this command buffer,
989     * size it based on the size of the currently exhausted pool.
990     */
991    uint32_t descriptor_count = 32;
992    if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) {
993       struct v3dv_descriptor_pool *exhausted_pool =
994          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool);
995       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
996    }
997 
998    /* Create the descriptor pool */
999    cmd_buffer->meta.query.dspool = VK_NULL_HANDLE;
1000    VkDescriptorPoolSize pool_size = {
1001       .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
1002       .descriptorCount = descriptor_count,
1003    };
1004    VkDescriptorPoolCreateInfo info = {
1005       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
1006       .maxSets = descriptor_count,
1007       .poolSizeCount = 1,
1008       .pPoolSizes = &pool_size,
1009       .flags = 0,
1010    };
1011    VkResult result =
1012       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
1013                                 &info,
1014                                 &cmd_buffer->device->vk.alloc,
1015                                 &cmd_buffer->meta.query.dspool);
1016 
1017    if (result == VK_SUCCESS) {
1018       assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
1019       const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool;
1020 
1021       v3dv_cmd_buffer_add_private_obj(
1022          cmd_buffer, (uintptr_t) vk_pool,
1023          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
1024 
1025       struct v3dv_descriptor_pool *pool =
1026          v3dv_descriptor_pool_from_handle(vk_pool);
1027       pool->is_driver_internal = true;
1028    }
1029 
1030    return result;
1031 }
1032 
1033 static VkResult
1034 allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
1035                                        VkDescriptorSet *set)
1036 {
1037    /* Make sure we have a descriptor pool */
1038    VkResult result;
1039    if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) {
1040       result = create_storage_buffer_descriptor_pool(cmd_buffer);
1041       if (result != VK_SUCCESS)
1042          return result;
1043    }
1044    assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
1045 
1046    /* Allocate descriptor set */
1047    struct v3dv_device *device = cmd_buffer->device;
1048    VkDevice vk_device = v3dv_device_to_handle(device);
1049    VkDescriptorSetAllocateInfo info = {
1050       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
1051       .descriptorPool = cmd_buffer->meta.query.dspool,
1052       .descriptorSetCount = 1,
1053       .pSetLayouts = &device->queries.buf_descriptor_set_layout,
1054    };
1055    result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
1056 
1057    /* If we ran out of pool space, grow the pool and try again */
1058    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
1059       result = create_storage_buffer_descriptor_pool(cmd_buffer);
1060       if (result == VK_SUCCESS) {
1061          info.descriptorPool = cmd_buffer->meta.query.dspool;
1062          result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
1063       }
1064    }
1065 
1066    return result;
1067 }
1068 
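/* Selects one of 8 compute copy pipeline variants based on the result flags:
 * bit 0 = 64-bit results, bit 1 = write availability, bit 2 = partial results.
 */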
1069 static uint32_t
1070 copy_pipeline_index_from_flags(VkQueryResultFlags flags)
1071 {
1072    uint32_t index = 0;
1073    if (flags & VK_QUERY_RESULT_64_BIT)
1074       index |= 1;
1075    if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
1076       index |= 2;
1077    if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
1078       index |= 4;
1079    assert(index < 8);
1080    return index;
1081 }
1082 
1083 static nir_shader *
1084 get_copy_query_results_cs(const nir_shader_compiler_options *compiler_options,
1085                           VkQueryResultFlags flags);
1086 
1087 static void
1088 cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer,
1089                                         struct v3dv_query_pool *pool,
1090                                         uint32_t first, uint32_t count,
1091                                         struct v3dv_buffer *buf,
1092                                         uint32_t offset, uint32_t stride,
1093                                         VkQueryResultFlags flags)
1094 {
1095    struct v3dv_device *device = cmd_buffer->device;
1096    VkDevice vk_device = v3dv_device_to_handle(device);
1097    VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
1098 
1099    /* Create the required copy pipeline if not yet created */
1100    uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags);
1101    if (!device->queries.copy_pipeline[pipeline_idx]) {
1102       const nir_shader_compiler_options *compiler_options =
1103          v3dv_pipeline_get_nir_options(&device->devinfo);
1104       nir_shader *copy_query_results_cs_nir =
1105          get_copy_query_results_cs(compiler_options, flags);
1106       VkResult result =
1107          v3dv_create_compute_pipeline_from_nir(
1108                device, copy_query_results_cs_nir,
1109                device->queries.copy_pipeline_layout,
1110                &device->queries.copy_pipeline[pipeline_idx]);
1111       ralloc_free(copy_query_results_cs_nir);
1112       if (result != VK_SUCCESS) {
1113          mesa_loge("Failed to create copy query results pipeline\n");
1114          return;
1115       }
1116    }
1117 
1118    /* FIXME: do we need this barrier? vkCmdEndQuery should have been called
1119     * already and that waits, so maybe we don't (this is serialized in the
1120     * compute queue with EndQuery anyway).
1121     */
1122    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1123       VkMemoryBarrier2 barrier = {
1124          .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
1125          .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
1126          .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
1127       };
1128       VkDependencyInfo barrier_info = {
1129          .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
1130          .memoryBarrierCount = 1,
1131          .pMemoryBarriers = &barrier,
1132       };
1133       v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
1134    }
1135 
1136    /* Allocate and setup descriptor set for output buffer */
1137    VkDescriptorSet out_buf_descriptor_set;
1138    VkResult result =
1139       allocate_storage_buffer_descriptor_set(cmd_buffer,
1140                                              &out_buf_descriptor_set);
1141    if (result != VK_SUCCESS) {
1142       mesa_loge("vkCmdCopyQueryPoolResults failed: "
1143                 "could not allocate descriptor.\n");
1144       return;
1145    }
1146 
1147    VkDescriptorBufferInfo desc_buf_info = {
1148       .buffer = v3dv_buffer_to_handle(buf),
1149       .offset = 0,
1150       .range = VK_WHOLE_SIZE,
1151    };
1152    VkWriteDescriptorSet write = {
1153       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1154       .dstSet = out_buf_descriptor_set,
1155       .dstBinding = 0,
1156       .dstArrayElement = 0,
1157       .descriptorCount = 1,
1158       .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1159       .pBufferInfo = &desc_buf_info,
1160    };
1161    v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
1162 
1163    /* Dispatch copy */
1164    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
1165 
1166    assert(device->queries.copy_pipeline[pipeline_idx]);
1167    v3dv_CmdBindPipeline(vk_cmd_buffer,
1168                         VK_PIPELINE_BIND_POINT_COMPUTE,
1169                         device->queries.copy_pipeline[pipeline_idx]);
1170 
1171    VkDescriptorSet sets[2] = {
1172       pool->meta.descriptor_set,
1173       out_buf_descriptor_set,
1174    };
1175    v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
1176                               VK_PIPELINE_BIND_POINT_COMPUTE,
1177                               device->queries.copy_pipeline_layout,
1178                               0, 2, sets, 0, NULL);
1179 
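   /* Push constants consumed by the copy shader: offset of the availability
    * data in the pool BO, first query index, destination offset and stride,
    * and the result flags.
    */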
1180    struct {
1181       uint32_t avail_offset, first, offset, stride, flags;
1182    } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags };
1183    v3dv_CmdPushConstants(vk_cmd_buffer,
1184                          device->queries.copy_pipeline_layout,
1185                          VK_SHADER_STAGE_COMPUTE_BIT,
1186                          0, sizeof(push_data), &push_data);
1187 
1188    cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
1189 
1190    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
1191 }
1192 
1193 static void
1194 cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer *cmd_buffer,
1195                                             struct v3dv_query_pool *pool,
1196                                             uint32_t first,
1197                                             uint32_t count,
1198                                             struct v3dv_buffer *dst,
1199                                             uint32_t offset,
1200                                             uint32_t stride,
1201                                             VkQueryResultFlags flags)
1202 {
1203    struct v3dv_job *job =
1204       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
1205                                      V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
1206                                      cmd_buffer, -1);
1207    v3dv_return_if_oom(cmd_buffer, NULL);
1208 
1209    job->cpu.query_copy_results.pool = pool;
1210    job->cpu.query_copy_results.first = first;
1211    job->cpu.query_copy_results.count = count;
1212    job->cpu.query_copy_results.dst = dst;
1213    job->cpu.query_copy_results.offset = offset;
1214    job->cpu.query_copy_results.stride = stride;
1215    job->cpu.query_copy_results.flags = flags;
1216 
1217    list_addtail(&job->list_link, &cmd_buffer->jobs);
1218 }
1219 
1220 VKAPI_ATTR void VKAPI_CALL
1221 v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
1222                              VkQueryPool queryPool,
1223                              uint32_t firstQuery,
1224                              uint32_t queryCount,
1225                              VkBuffer dstBuffer,
1226                              VkDeviceSize dstOffset,
1227                              VkDeviceSize stride,
1228                              VkQueryResultFlags flags)
1229 {
1230    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1231    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1232    V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);
1233 
1234    /* Copies can only happen outside a render pass instance so we should not
1235     * be in the middle of job recording.
1236     */
1237    assert(cmd_buffer->state.pass == NULL);
1238    assert(cmd_buffer->state.job == NULL);
1239 
1240    assert(firstQuery < pool->query_count);
1241    assert(firstQuery + queryCount <= pool->query_count);
1242 
1243    /* For occlusion queries we implement the copy in the GPU but for other
1244     * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu
1245     * when executed in the queue.
1246     */
1247    if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
1248       cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool,
1249                                               firstQuery, queryCount,
1250                                               dst, (uint32_t) dstOffset,
1251                                               (uint32_t) stride, flags);
1252    } else {
1253       cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool,
1254                                                   firstQuery, queryCount,
1255                                                   dst, (uint32_t)dstOffset,
1256                                                   (uint32_t) stride, flags);
1257    }
1258 }
1259 
1260 VKAPI_ATTR void VKAPI_CALL
1261 v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,
1262                    VkQueryPool queryPool,
1263                    uint32_t query,
1264                    VkQueryControlFlags flags)
1265 {
1266    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1267    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1268 
1269    v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags);
1270 }
1271 
1272 VKAPI_ATTR void VKAPI_CALL
1273 v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
1274                  VkQueryPool queryPool,
1275                  uint32_t query)
1276 {
1277    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1278    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1279 
1280    v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
1281 }
1282 
1283 void
1284 v3dv_reset_query_pool_cpu(struct v3dv_device *device,
1285                           struct v3dv_query_pool *pool,
1286                           uint32_t first,
1287                           uint32_t count)
1288 {
1289    mtx_lock(&device->query_mutex);
1290 
1291    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
1292       assert(first + count <= pool->query_count);
1293 
1294       /* Reset timestamp */
1295       uint8_t *base_addr;
1296       base_addr  = ((uint8_t *) pool->timestamp.bo->map) +
1297                     pool->queries[first].timestamp.offset;
1298       memset(base_addr, 0, 8 * count);
1299 
1300       for (uint32_t i = first; i < first + count; i++) {
1301          if (vk_sync_reset(&device->vk, pool->queries[i].timestamp.sync) != VK_SUCCESS)
1302             mesa_loge("Failed to reset sync");
1303       }
1304 
1305       mtx_unlock(&device->query_mutex);
1306       return;
1307    }
1308 
1309    for (uint32_t i = first; i < first + count; i++) {
1310       assert(i < pool->query_count);
1311       struct v3dv_query *q = &pool->queries[i];
1312       q->maybe_available = false;
1313       switch (pool->query_type) {
1314       case VK_QUERY_TYPE_OCCLUSION: {
1315          /* Reset availability */
1316          uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) +
1317                               pool->occlusion.avail_offset + first;
1318          memset(base_addr, 0, count);
1319 
1320          /* Reset occlusion counter */
1321          const uint8_t *q_addr =
1322             ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
1323          uint32_t *counter = (uint32_t *) q_addr;
1324          *counter = 0;
1325          break;
1326       }
1327       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1328          kperfmon_destroy(device, pool, i);
1329          kperfmon_create(device, pool, i);
1330          if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
1331             mesa_loge("Failed to reset sync");
1332          break;
1333       default:
1334          unreachable("Unsupported query type");
1335       }
1336    }
1337 
1338    mtx_unlock(&device->query_mutex);
1339 }
1340 
1341 VKAPI_ATTR void VKAPI_CALL
1342 v3dv_ResetQueryPool(VkDevice _device,
1343                     VkQueryPool queryPool,
1344                     uint32_t firstQuery,
1345                     uint32_t queryCount)
1346 {
1347    V3DV_FROM_HANDLE(v3dv_device, device, _device);
1348    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1349 
1350    v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount);
1351 }
1352 
1353 VKAPI_ATTR VkResult VKAPI_CALL
1354 v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1355    VkPhysicalDevice physicalDevice,
1356    uint32_t queueFamilyIndex,
1357    uint32_t *pCounterCount,
1358    VkPerformanceCounterKHR *pCounters,
1359    VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
1360 {
1361    V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);
1362 
1363    uint32_t desc_count = *pCounterCount;
1364    uint8_t ncounters = pDevice->perfcntr->max_perfcnt;
1365 
1366    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
1367                           out, pCounters, pCounterCount);
1368    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
1369                           out_desc, pCounterDescriptions, &desc_count);
1370 
1371    for (int i = 0; i < ncounters; i++) {
1372       const struct v3d_perfcntr_desc *perfcntr_desc = v3d_perfcntrs_get_by_index(pDevice->perfcntr, i);
1373 
1374       vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
1375          counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
1376          counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
1377          counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
1378 
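         /* The counter UUID must be stable across enumerations; derive it
          * from the counter name by truncating its SHA-1 digest to the UUID
          * size.
          */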
1379          unsigned char sha1_result[20];
1380          _mesa_sha1_compute(perfcntr_desc->name, strlen(perfcntr_desc->name), sha1_result);
1381 
1382          memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
1383       }
1384 
1385       vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
1386                                &out_desc, desc) {
1387          desc->flags = 0;
1388          snprintf(desc->name, sizeof(desc->name), "%s", perfcntr_desc->name);
1389          snprintf(desc->category, sizeof(desc->category), "%s", perfcntr_desc->category);
1390          snprintf(desc->description, sizeof(desc->description), "%s", perfcntr_desc->description);
1391       }
1392    }
1393 
1394    return vk_outarray_status(&out);
1395 }
1396 
1397 VKAPI_ATTR void VKAPI_CALL
1398 v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1399    VkPhysicalDevice physicalDevice,
1400    const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
1401    uint32_t *pNumPasses)
1402 {
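   /* Each pass can sample at most DRM_V3D_MAX_PERF_COUNTERS counters, so
    * enabling more counters than that requires additional passes.
    */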
1403    *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
1404                               DRM_V3D_MAX_PERF_COUNTERS);
1405 }
1406 
1407 VKAPI_ATTR VkResult VKAPI_CALL
1408 v3dv_AcquireProfilingLockKHR(
1409    VkDevice _device,
1410    const VkAcquireProfilingLockInfoKHR *pInfo)
1411 {
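   /* Profiling does not require any driver-side locking here, so acquiring
    * and releasing the profiling lock are no-ops.
    */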
1412    return VK_SUCCESS;
1413 }
1414 
1415 VKAPI_ATTR void VKAPI_CALL
1416 v3dv_ReleaseProfilingLockKHR(VkDevice device)
1417 {
1418 }
1419 
1420 static inline void
1421 nir_set_query_availability(nir_builder *b,
1422                            nir_def *buf,
1423                            nir_def *offset,
1424                            nir_def *query_idx,
1425                            nir_def *avail)
1426 {
1427    offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
1428    nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1);
1429 }
1430 
1431 static inline nir_def *
1432 nir_get_query_availability(nir_builder *b,
1433                            nir_def *buf,
1434                            nir_def *offset,
1435                            nir_def *query_idx)
1436 {
1437    offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
1438    nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1);
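   /* The availability flag is stored as a single byte; widen it to 32 bits
    * so callers can use it in regular 32-bit comparisons and stores.
    */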
1439    return nir_i2i32(b, avail);
1440 }
1441 
1442 static nir_shader *
1443 get_set_query_availability_cs(const nir_shader_compiler_options *options)
1444 {
1445    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1446                                                   "set query availability cs");
1447 
1448    nir_def *buf =
1449       nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1450                                 .desc_set = 0,
1451                                 .binding = 0,
1452                                 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1453 
1454    /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1455     * ever change any of these parameters we need to update how we compute the
1456     * query index here.
1457     */
1458    nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1459 
1460    nir_def *offset =
1461       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1462 
1463    nir_def *query_idx =
1464       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1465 
1466    nir_def *avail =
1467       nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1);
1468 
1469    query_idx = nir_iadd(&b, query_idx, wg_id);
1470    nir_set_query_availability(&b, buf, offset, query_idx, avail);
1471 
1472    return b.shader;
1473 }
1474 
1475 static inline nir_def *
1476 nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx)
1477 {
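   /* Occlusion counters are laid out in groups of 16 queries, with each
    * group taking 1024 bytes and each counter being a 32-bit value, so the
    * counter for a query is at (query_idx / 16) * 1024 + (query_idx % 16) * 4.
    */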
1478    nir_def *query_group = nir_udiv_imm(b, query_idx, 16);
1479    nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16);
1480    nir_def *offset =
1481       nir_iadd(b, nir_imul_imm(b, query_group, 1024),
1482                   nir_imul_imm(b, query_group_offset, 4));
1483    return offset;
1484 }
1485 
1486 static inline void
1487 nir_reset_occlusion_counter(nir_builder *b,
1488                             nir_def *buf,
1489                             nir_def *query_idx)
1490 {
1491    nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
1492    nir_def *zero = nir_imm_int(b, 0);
1493    nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4);
1494 }
1495 
1496 static inline nir_def *
1497 nir_read_occlusion_counter(nir_builder *b,
1498                            nir_def *buf,
1499                            nir_def *query_idx)
1500 {
1501    nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
1502    return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4);
1503 }
1504 
1505 static nir_shader *
1506 get_reset_occlusion_query_cs(const nir_shader_compiler_options *options)
1507 {
1508    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1509                                                   "reset occlusion query cs");
1510 
1511    nir_def *buf =
1512       nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1513                                 .desc_set = 0,
1514                                 .binding = 0,
1515                                 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1516 
1517    /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1518     * ever change any of these parameters we need to update how we compute the
1519     * query index here.
1520     */
1521    nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1522 
1523    nir_def *avail_offset =
1524       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1525 
1526    nir_def *base_query_idx =
1527       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1528 
1529    nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
1530 
1531    nir_set_query_availability(&b, buf, avail_offset, query_idx,
1532                               nir_imm_intN_t(&b, 0, 8));
1533    nir_reset_occlusion_counter(&b, buf, query_idx);
1534 
1535    return b.shader;
1536 }
1537 
1538 static void
1539 write_query_buffer(nir_builder *b,
1540                    nir_def *buf,
1541                    nir_def **offset,
1542                    nir_def *value,
1543                    bool flag_64bit)
1544 {
1545    if (flag_64bit) {
1546       /* Create a 64-bit value using a vec2 with the .Y component set to 0
1547        * so we can write a 64-bit value in a single store.
1548        */
1549       nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0));
1550       nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8);
1551       *offset = nir_iadd_imm(b, *offset, 8);
1552    } else {
1553       nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4);
1554       *offset = nir_iadd_imm(b, *offset, 4);
1555    }
1556 }
1557 
1558 static nir_shader *
1559 get_copy_query_results_cs(const nir_shader_compiler_options *options,
1560                           VkQueryResultFlags flags)
1561 {
1562    bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT;
1563    bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
1564    bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
1565 
1566    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
1567                                                   "copy query results cs");
1568 
1569    nir_def *buf =
1570       nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1571                                 .desc_set = 0,
1572                                 .binding = 0,
1573                                 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1574 
1575    nir_def *buf_out =
1576       nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
1577                                 .desc_set = 1,
1578                                 .binding = 0,
1579                                 .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1580 
1581    /* Read push constants */
1582    nir_def *avail_offset =
1583       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);
1584 
1585    nir_def *base_query_idx =
1586       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);
1587 
1588    nir_def *base_offset_out =
1589       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4);
1590 
1591    nir_def *stride =
1592       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4);
1593 
1594    /* This assumes a local size of 1 and a horizontal-only dispatch. If we
1595     * ever change any of these parameters we need to update how we compute the
1596     * query index here.
1597     */
1598    nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
1599    nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);
1600 
1601    /* Read query availability if needed */
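   /* We need the availability either to copy it to the output (flag_avail)
    * or to decide whether we can write the result at all when partial
    * results are not allowed (!flag_partial).
    */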
1602    nir_def *avail = NULL;
1603    if (flag_avail || !flag_partial)
1604       avail = nir_get_query_availability(&b, buf, avail_offset, query_idx);
1605 
1606    /* Write occlusion query result... */
1607    nir_def *offset =
1608       nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride));
1609 
1610    /* ...if partial is requested, we always write */
1611    if (flag_partial) {
1612       nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
1613       write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
1614    } else {
1615       /* ...otherwise, we only write if the query is available */
1616       nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0));
1617          nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
1618          write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
1619       nir_pop_if(&b, if_stmt);
1620    }
1621 
1622    /* Write query availability */
1623    if (flag_avail)
1624       write_query_buffer(&b, buf_out, &offset, avail, flag_64bit);
1625 
1626    return b.shader;
1627 }
1628 
1629 static bool
1630 create_query_pipelines(struct v3dv_device *device)
1631 {
1632    VkResult result;
1633    VkPipeline pipeline;
1634 
1635    /* Set layout: single storage buffer */
1636    if (!device->queries.buf_descriptor_set_layout) {
1637       VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
1638          .binding = 0,
1639          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1640          .descriptorCount = 1,
1641          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1642       };
1643       VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
1644          .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
1645          .bindingCount = 1,
1646          .pBindings = &descriptor_set_layout_binding,
1647       };
1648       result =
1649          v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
1650                                         &descriptor_set_layout_info,
1651                                         &device->vk.alloc,
1652                                         &device->queries.buf_descriptor_set_layout);
1653       if (result != VK_SUCCESS)
1654          return false;
1655    }
1656 
1657    /* Set availability pipeline.
1658     *
1659     * Pipeline layout:
1660     *  - 1 storage buffer for the BO with the query availability.
1661     *  - 3 push constants:
1662     *    0B: offset of the availability info in the buffer (4B)
1663     *    4B: base query index (4B)
1664     *    8B: availability (1B)
1665     */
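   /* Packed together these push constants take 9 bytes, matching the push
    * constant range declared below. A hypothetical C view of the layout
    * (illustration only, not a struct used by the driver):
    *
    *    struct {
    *       uint32_t avail_offset;    // 0B
    *       uint32_t base_query_idx;  // 4B
    *       uint8_t  availability;    // 8B
    *    };
    */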
1666    if (!device->queries.avail_pipeline_layout) {
1667       VkPipelineLayoutCreateInfo pipeline_layout_info = {
1668          .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
1669          .setLayoutCount = 1,
1670          .pSetLayouts = &device->queries.buf_descriptor_set_layout,
1671          .pushConstantRangeCount = 1,
1672          .pPushConstantRanges =
1673              &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 },
1674       };
1675 
1676       result =
1677          v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
1678                                    &pipeline_layout_info,
1679                                    &device->vk.alloc,
1680                                    &device->queries.avail_pipeline_layout);
1681 
1682       if (result != VK_SUCCESS)
1683          return false;
1684    }
1685 
1686    const nir_shader_compiler_options *compiler_options =
1687       v3dv_pipeline_get_nir_options(&device->devinfo);
1688 
1689    if (!device->queries.avail_pipeline) {
1690       nir_shader *set_query_availability_cs_nir =
1691          get_set_query_availability_cs(compiler_options);
1692       result = v3dv_create_compute_pipeline_from_nir(device,
1693                                                      set_query_availability_cs_nir,
1694                                                      device->queries.avail_pipeline_layout,
1695                                                      &pipeline);
1696       ralloc_free(set_query_availability_cs_nir);
1697       if (result != VK_SUCCESS)
1698          return false;
1699 
1700       device->queries.avail_pipeline = pipeline;
1701    }
1702 
1703    /* Reset occlusion query pipeline.
1704     *
1705     * Pipeline layout:
1706     *  - 1 storage buffer for the BO with the occlusion and availability data.
1707     *  - Push constants:
1708     *    0B: offset of the availability info in the buffer (4B)
1709     *    4B: base query index (4B)
1710     */
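   /* These two push constants take 8 bytes in total, matching the push
    * constant range declared below.
    */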
1711    if (!device->queries.reset_occlusion_pipeline_layout) {
1712       VkPipelineLayoutCreateInfo pipeline_layout_info = {
1713          .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
1714          .setLayoutCount = 1,
1715          .pSetLayouts = &device->queries.buf_descriptor_set_layout,
1716          .pushConstantRangeCount = 1,
1717          .pPushConstantRanges =
1718              &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 },
1719       };
1720 
1721       result =
1722          v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
1723                                    &pipeline_layout_info,
1724                                    &device->vk.alloc,
1725                                    &device->queries.reset_occlusion_pipeline_layout);
1726 
1727       if (result != VK_SUCCESS)
1728          return false;
1729    }
1730 
1731    if (!device->queries.reset_occlusion_pipeline) {
1732       nir_shader *reset_occlusion_query_cs_nir =
1733          get_reset_occlusion_query_cs(compiler_options);
1734       result = v3dv_create_compute_pipeline_from_nir(
1735                   device,
1736                   reset_occlusion_query_cs_nir,
1737                   device->queries.reset_occlusion_pipeline_layout,
1738                   &pipeline);
1739       ralloc_free(reset_occlusion_query_cs_nir);
1740       if (result != VK_SUCCESS)
1741          return false;
1742 
1743       device->queries.reset_occlusion_pipeline = pipeline;
1744    }
1745 
1746    /* Copy query results pipelines.
1747     *
1748     * Pipeline layout:
1749     *  - 1 storage buffer for the BO with the query availability and occlusion.
1750     *  - 1 storage buffer for the output.
1751     *  - Push constants:
1752     *    0B: offset of the availability info in the buffer (4B)
1753     *    4B: base query index (4B)
1754     *    8B: offset into output buffer (4B)
1755     *    12B: stride (4B)
1756     *
1757     * We create multiple specialized pipelines depending on the copy flags
1758     * to remove conditionals from the copy shader and get more optimized
1759     * pipelines.
1760     */
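   /* The four 32-bit push constants above account for the 16-byte push
    * constant range below; a hypothetical packed view (illustration only):
    *
    *    struct {
    *       uint32_t avail_offset;     // 0B
    *       uint32_t base_query_idx;   // 4B
    *       uint32_t base_offset_out;  // 8B
    *       uint32_t stride;           // 12B
    *    };
    *
    * The three copy flags that affect the shader (64_BIT, WITH_AVAILABILITY
    * and PARTIAL) give up to 2^3 = 8 specialized pipeline variants.
    */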
1761    if (!device->queries.copy_pipeline_layout) {
1762       VkDescriptorSetLayout set_layouts[2] = {
1763          device->queries.buf_descriptor_set_layout,
1764          device->queries.buf_descriptor_set_layout
1765       };
1766       VkPipelineLayoutCreateInfo pipeline_layout_info = {
1767          .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
1768          .setLayoutCount = 2,
1769          .pSetLayouts = set_layouts,
1770          .pushConstantRangeCount = 1,
1771          .pPushConstantRanges =
1772              &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 },
1773       };
1774 
1775       result =
1776          v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
1777                                    &pipeline_layout_info,
1778                                    &device->vk.alloc,
1779                                    &device->queries.copy_pipeline_layout);
1780 
1781       if (result != VK_SUCCESS)
1782          return false;
1783    }
1784 
1785    /* Actual copy pipelines are created lazily on demand since there can be up
1786     * to 8 depending on the flags used, however it is likely that applications
1787     * will use the same flags every time and only one pipeline is required.
1788     */
1789 
1790    return true;
1791 }
1792 
1793 static void
1794 destroy_query_pipelines(struct v3dv_device *device)
1795 {
1796    VkDevice _device = v3dv_device_to_handle(device);
1797 
1798    /* Availability pipeline */
1799    v3dv_DestroyPipeline(_device, device->queries.avail_pipeline,
1800                          &device->vk.alloc);
1801    device->queries.avail_pipeline = VK_NULL_HANDLE;
1802    v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout,
1803                               &device->vk.alloc);
1804    device->queries.avail_pipeline_layout = VK_NULL_HANDLE;
1805 
1806    /* Reset occlusion pipeline */
1807    v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline,
1808                          &device->vk.alloc);
1809    device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE;
1810    v3dv_DestroyPipelineLayout(_device,
1811                               device->queries.reset_occlusion_pipeline_layout,
1812                               &device->vk.alloc);
1813    device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE;
1814 
1815    /* Copy pipelines */
1816    for (int i = 0; i < 8; i++) {
1817       v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i],
1818                             &device->vk.alloc);
1819       device->queries.copy_pipeline[i] = VK_NULL_HANDLE;
1820    }
1821    v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout,
1822                               &device->vk.alloc);
1823    device->queries.copy_pipeline_layout = VK_NULL_HANDLE;
1824 
1825    v3dv_DestroyDescriptorSetLayout(_device,
1826                                    device->queries.buf_descriptor_set_layout,
1827                                    &device->vk.alloc);
1828    device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE;
1829 }
1830 
1831 /**
1832  * Allocates device resources for implementing certain types of queries.
1833  */
1834 VkResult
1835 v3dv_query_allocate_resources(struct v3dv_device *device)
1836 {
1837    if (!create_query_pipelines(device))
1838       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1839 
1840    return VK_SUCCESS;
1841 }
1842 
1843 void
1844 v3dv_query_free_resources(struct v3dv_device *device)
1845 {
1846    destroy_query_pipelines(device);
1847 }
1848