1 /*
2  * Copyright © 2020 Raspberry Pi Ltd
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 
26 #include "util/timespec.h"
27 #include "compiler/nir/nir_builder.h"
28 
29 static void
30 kperfmon_create(struct v3dv_device *device,
31                 struct v3dv_query_pool *pool,
32                 uint32_t query)
33 {
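   /* A single query may use more counters than one kernel perfmon supports,
    * so the counters are spread across pool->perfmon.nperfmons perfmons,
    * each taking up to DRM_V3D_MAX_PERF_COUNTERS of them.
    */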
34    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
35       assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
36 
37       struct drm_v3d_perfmon_create req = {
38          .ncounters = MIN2(pool->perfmon.ncounters -
39                            i * DRM_V3D_MAX_PERF_COUNTERS,
40                            DRM_V3D_MAX_PERF_COUNTERS),
41       };
42       memcpy(req.counters,
43              &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
44              req.ncounters);
45 
46       int ret = v3dv_ioctl(device->pdevice->render_fd,
47                            DRM_IOCTL_V3D_PERFMON_CREATE,
48                            &req);
49       if (ret)
50          fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret));
51 
52       pool->queries[query].perf.kperfmon_ids[i] = req.id;
53    }
54 }
55 
56 static void
57 kperfmon_destroy(struct v3dv_device *device,
58                  struct v3dv_query_pool *pool,
59                  uint32_t query)
60 {
61    /* Skip destroying if never created */
62    if (!pool->queries[query].perf.kperfmon_ids[0])
63       return;
64 
65    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
66       struct drm_v3d_perfmon_destroy req = {
67          .id = pool->queries[query].perf.kperfmon_ids[i]
68       };
69 
70       int ret = v3dv_ioctl(device->pdevice->render_fd,
71                            DRM_IOCTL_V3D_PERFMON_DESTROY,
72                            &req);
73 
74       if (ret) {
75          fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
76                  req.id, strerror(ret));
77       }
78    }
79 }
80 
81 /**
82  * Creates a VkBuffer (and VkDeviceMemory) to access a BO.
83  */
84 static VkResult
85 create_vk_storage_buffer(struct v3dv_device *device,
86                          struct v3dv_bo *bo,
87                          VkBuffer *vk_buf,
88                          VkDeviceMemory *vk_mem)
89 {
90    VkDevice vk_device = v3dv_device_to_handle(device);
91 
92    VkBufferCreateInfo buf_info = {
93       .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
94       .size = bo->size,
95       .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
96    };
97    VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf);
98    if (result != VK_SUCCESS)
99       return result;
100 
101    struct v3dv_device_memory *mem =
102       vk_object_zalloc(&device->vk, NULL, sizeof(*mem),
103                        VK_OBJECT_TYPE_DEVICE_MEMORY);
104    if (!mem)
105       return VK_ERROR_OUT_OF_HOST_MEMORY;
106 
107    mem->bo = bo;
108    mem->type = &device->pdevice->memory.memoryTypes[0];
109 
110    *vk_mem = v3dv_device_memory_to_handle(mem);
111    VkBindBufferMemoryInfo bind_info = {
112       .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
113       .buffer = *vk_buf,
114       .memory = *vk_mem,
115       .memoryOffset = 0,
116    };
117    v3dv_BindBufferMemory2(vk_device, 1, &bind_info);
118 
119    return VK_SUCCESS;
120 }
121 
122 static void
123 destroy_vk_storage_buffer(struct v3dv_device *device,
124                           VkBuffer *vk_buf,
125                           VkDeviceMemory *vk_mem)
126 {
127    if (*vk_mem) {
128       vk_object_free(&device->vk, NULL, v3dv_device_memory_from_handle(*vk_mem));
129       *vk_mem = VK_NULL_HANDLE;
130    }
131 
132    v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL);
133    *vk_buf = VK_NULL_HANDLE;
134 }
135 
136 /**
137  * Allocates descriptor sets to access the query pool BO (availability and
138  * occlusion query results) from Vulkan pipelines.
139  */
140 static VkResult
141 create_pool_descriptors(struct v3dv_device *device,
142                         struct v3dv_query_pool *pool)
143 {
144    assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
145    VkDevice vk_device = v3dv_device_to_handle(device);
146 
147    VkDescriptorPoolSize pool_size = {
148       .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
149       .descriptorCount = 1,
150    };
151    VkDescriptorPoolCreateInfo pool_info = {
152       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
153       .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
154       .maxSets = 1,
155       .poolSizeCount = 1,
156       .pPoolSizes = &pool_size,
157    };
158    VkResult result =
159       v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL,
160                                 &pool->meta.descriptor_pool);
161 
162    if (result != VK_SUCCESS)
163       return result;
164 
165    VkDescriptorSetAllocateInfo alloc_info = {
166       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
167       .descriptorPool = pool->meta.descriptor_pool,
168       .descriptorSetCount = 1,
169       .pSetLayouts = &device->queries.buf_descriptor_set_layout,
170    };
171    result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info,
172                                         &pool->meta.descriptor_set);
173    if (result != VK_SUCCESS)
174       return result;
175 
176    VkDescriptorBufferInfo desc_buf_info = {
177       .buffer = pool->meta.buf,
178       .offset = 0,
179       .range = VK_WHOLE_SIZE,
180    };
181 
182    VkWriteDescriptorSet write = {
183       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
184       .dstSet = pool->meta.descriptor_set,
185       .dstBinding = 0,
186       .dstArrayElement = 0,
187       .descriptorCount = 1,
188       .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
189       .pBufferInfo = &desc_buf_info,
190    };
191    v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
192 
193    return VK_SUCCESS;
194 }
195 
196 static void
197 destroy_pool_descriptors(struct v3dv_device *device,
198                          struct v3dv_query_pool *pool)
199 {
200    assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
201 
202    v3dv_FreeDescriptorSets(v3dv_device_to_handle(device),
203                            pool->meta.descriptor_pool,
204                            1, &pool->meta.descriptor_set);
205    pool->meta.descriptor_set = VK_NULL_HANDLE;
206 
207    v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device),
208                               pool->meta.descriptor_pool, NULL);
209    pool->meta.descriptor_pool = VK_NULL_HANDLE;
210 }
211 
212 static VkResult
213 pool_create_meta_resources(struct v3dv_device *device,
214                            struct v3dv_query_pool *pool)
215 {
216    VkResult result;
217 
218    if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
219       return VK_SUCCESS;
220 
221    result = create_vk_storage_buffer(device, pool->occlusion.bo,
222                                      &pool->meta.buf, &pool->meta.mem);
223    if (result != VK_SUCCESS)
224       return result;
225 
226    result = create_pool_descriptors(device, pool);
227    if (result != VK_SUCCESS)
228       return result;
229 
230    return VK_SUCCESS;
231 }
232 
233 static void
234 pool_destroy_meta_resources(struct v3dv_device *device,
235                             struct v3dv_query_pool *pool)
236 {
237    if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
238       return;
239 
240    destroy_pool_descriptors(device, pool);
241    destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem);
242 }
243 
244 VKAPI_ATTR VkResult VKAPI_CALL
245 v3dv_CreateQueryPool(VkDevice _device,
246                      const VkQueryPoolCreateInfo *pCreateInfo,
247                      const VkAllocationCallbacks *pAllocator,
248                      VkQueryPool *pQueryPool)
249 {
250    V3DV_FROM_HANDLE(v3dv_device, device, _device);
251 
252    assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
253           pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
254           pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
255    assert(pCreateInfo->queryCount > 0);
256 
257    struct v3dv_query_pool *pool =
258       vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
259                        VK_OBJECT_TYPE_QUERY_POOL);
260    if (pool == NULL)
261       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
262 
263    pool->query_type = pCreateInfo->queryType;
264    pool->query_count = pCreateInfo->queryCount;
265 
266    uint32_t query_idx = 0;
267    VkResult result;
268 
269    const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
270    pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
271                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
272    if (pool->queries == NULL) {
273       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
274       goto fail;
275    }
276 
277    switch (pool->query_type) {
278    case VK_QUERY_TYPE_OCCLUSION: {
279       /* The hardware allows us to set up groups of 16 queries in consecutive
280        * 4-byte addresses, requiring only that each group of 16 queries is
281        * aligned to a 1024-byte boundary.
282        */
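      /* For example, under this layout query 17 belongs to group 1 and its
       * 4-byte counter lives at byte offset 1 * 1024 + (17 % 16) * 4 = 1028,
       * which matches the per-query offset computed when the queries are
       * initialized below.
       */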
283       const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
284       uint32_t bo_size = query_groups * 1024;
285       /* After the counters we store availability data, 1 byte/query */
286       pool->occlusion.avail_offset = bo_size;
287       bo_size += pool->query_count;
288       pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true);
289       if (!pool->occlusion.bo) {
290          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
291          goto fail;
292       }
293       if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) {
294          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
295          goto fail;
296       }
297       break;
298    }
299    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
300       const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
301          vk_find_struct_const(pCreateInfo->pNext,
302                               QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
303 
304       assert(pq_info);
305 
306       pool->perfmon.ncounters = pq_info->counterIndexCount;
307       for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
308          pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
309 
310       pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
311                                              DRM_V3D_MAX_PERF_COUNTERS);
312 
313       assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
314       break;
315    }
316    case VK_QUERY_TYPE_TIMESTAMP: {
317       /* 8 bytes per query used for the timestamp value. We have all
318        * timestamps tightly packed first in the buffer.
319        */
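      /* E.g. the timestamp of query N is the 8-byte value at byte offset
       * N * 8, which is how timestamp.offset is initialized further below.
       */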
320       const uint32_t bo_size = pool->query_count * 8;
321       pool->timestamp.bo = v3dv_bo_alloc(device, bo_size, "query:t", true);
322       if (!pool->timestamp.bo) {
323          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
324          goto fail;
325       }
326       if (!v3dv_bo_map(device, pool->timestamp.bo, bo_size)) {
327          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
328          goto fail;
329       }
330       break;
331    }
332    default:
333       unreachable("Unsupported query type");
334    }
335 
336    /* Initialize queries in the pool */
337    for (; query_idx < pool->query_count; query_idx++) {
338       pool->queries[query_idx].maybe_available = false;
339       switch (pool->query_type) {
340       case VK_QUERY_TYPE_OCCLUSION: {
341          const uint32_t query_group = query_idx / 16;
342          const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
343          pool->queries[query_idx].occlusion.offset = query_offset;
344          break;
345          }
346       case VK_QUERY_TYPE_TIMESTAMP:
347          pool->queries[query_idx].timestamp.offset = query_idx * 8;
348          result = vk_sync_create(&device->vk,
349                                  &device->pdevice->drm_syncobj_type, 0, 0,
350                                  &pool->queries[query_idx].timestamp.sync);
351          if (result != VK_SUCCESS)
352             goto fail;
353          break;
354       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
355          result = vk_sync_create(&device->vk,
356                                  &device->pdevice->drm_syncobj_type, 0, 0,
357                                  &pool->queries[query_idx].perf.last_job_sync);
358          if (result != VK_SUCCESS)
359             goto fail;
360 
361          kperfmon_create(device, pool, query_idx);
362          break;
363          }
364       default:
365          unreachable("Unsupported query type");
366       }
367    }
368 
369    /* Create meta resources */
370    result = pool_create_meta_resources(device, pool);
371    if (result != VK_SUCCESS)
372       goto fail;
373 
374    *pQueryPool = v3dv_query_pool_to_handle(pool);
375 
376    return VK_SUCCESS;
377 
378 fail:
379    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
380       for (uint32_t j = 0; j < query_idx; j++)
381          vk_sync_destroy(&device->vk, pool->queries[j].timestamp.sync);
382    }
383 
384    if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
385       for (uint32_t j = 0; j < query_idx; j++)
386          vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
387    }
388 
389    if (pool->occlusion.bo)
390       v3dv_bo_free(device, pool->occlusion.bo);
391    if (pool->timestamp.bo)
392       v3dv_bo_free(device, pool->timestamp.bo);
393    if (pool->queries)
394       vk_free2(&device->vk.alloc, pAllocator, pool->queries);
395    pool_destroy_meta_resources(device, pool);
396    vk_object_free(&device->vk, pAllocator, pool);
397 
398    return result;
399 }
400 
401 VKAPI_ATTR void VKAPI_CALL
402 v3dv_DestroyQueryPool(VkDevice _device,
403                       VkQueryPool queryPool,
404                       const VkAllocationCallbacks *pAllocator)
405 {
406    V3DV_FROM_HANDLE(v3dv_device, device, _device);
407    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
408 
409    if (!pool)
410       return;
411 
412    if (pool->occlusion.bo)
413       v3dv_bo_free(device, pool->occlusion.bo);
414 
415    if (pool->timestamp.bo)
416       v3dv_bo_free(device, pool->timestamp.bo);
417 
418    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
419       for (uint32_t i = 0; i < pool->query_count; i++)
420          vk_sync_destroy(&device->vk, pool->queries[i].timestamp.sync);
421    }
422 
423    if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
424       for (uint32_t i = 0; i < pool->query_count; i++) {
425          kperfmon_destroy(device, pool, i);
426          vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
427       }
428    }
429 
430    if (pool->queries)
431       vk_free2(&device->vk.alloc, pAllocator, pool->queries);
432 
433    pool_destroy_meta_resources(device, pool);
434 
435    vk_object_free(&device->vk, pAllocator, pool);
436 }
437 
438 static void
439 write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
440 {
441    if (do_64bit) {
442       uint64_t *dst64 = (uint64_t *) dst;
443       dst64[idx] = value;
444    } else {
445       uint32_t *dst32 = (uint32_t *) dst;
446       dst32[idx] = (uint32_t) value;
447    }
448 }
449 
450 static VkResult
451 query_wait_available(struct v3dv_device *device,
452                      struct v3dv_query_pool *pool,
453                      struct v3dv_query *q,
454                      uint32_t query_idx)
455 {
456    /* For occlusion queries we prefer polling the availability BO in a loop
457     * over waiting on the query results BO, because the latter would make us
458     * wait for any job running queries from the pool, even if those queries
459     * do not involve the one we want to wait on.
460     */
461    if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
462       uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
463                         pool->occlusion.avail_offset + query_idx;
464       while (*q_addr == 0)
465          usleep(250);
466       return VK_SUCCESS;
467    }
468 
469    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
470       if (vk_sync_wait(&device->vk, q->timestamp.sync,
471                        0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
472          return vk_device_set_lost(&device->vk, "Query job wait failed");
473       }
474       return VK_SUCCESS;
475    }
476 
477    assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
478 
479    /* For performance queries we need to wait for the queue to signal that
480     * the query has been submitted for execution before anything else.
481     */
482    VkResult result = VK_SUCCESS;
483    if (!q->maybe_available) {
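      /* Wait on the query_ended condition with an absolute timeout of 2
       * seconds; if the queue never flags the query as submitted within that
       * window we treat the device as lost.
       */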
484       struct timespec timeout;
485       timespec_get(&timeout, TIME_UTC);
486       timespec_add_msec(&timeout, &timeout, 2000);
487 
488       mtx_lock(&device->query_mutex);
489       while (!q->maybe_available) {
490          if (vk_device_is_lost(&device->vk)) {
491             result = VK_ERROR_DEVICE_LOST;
492             break;
493          }
494 
495          int ret = cnd_timedwait(&device->query_ended,
496                                  &device->query_mutex,
497                                  &timeout);
498          if (ret != thrd_success) {
499             mtx_unlock(&device->query_mutex);
500             result = vk_device_set_lost(&device->vk, "Query wait failed");
501             break;
502          }
503       }
504       mtx_unlock(&device->query_mutex);
505 
506       if (result != VK_SUCCESS)
507          return result;
508 
509       /* For performance queries, we also need to wait for the relevant syncobj
510        * to be signaled to ensure completion of the GPU work.
511        */
512       if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
513           vk_sync_wait(&device->vk, q->perf.last_job_sync,
514                        0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
515         return vk_device_set_lost(&device->vk, "Query job wait failed");
516       }
517    }
518 
519    return result;
520 }
521 
522 static VkResult
523 query_check_available(struct v3dv_device *device,
524                       struct v3dv_query_pool *pool,
525                       struct v3dv_query *q,
526                       uint32_t query_idx)
527 {
528    /* For occlusion we check the availability BO */
529    if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
530       const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
531                               pool->occlusion.avail_offset + query_idx;
532       return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY;
533    }
534 
535    /* For timestamp queries, we need to check if the relevant job
536     * has completed.
537     */
538    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
539       if (vk_sync_wait(&device->vk, q->timestamp.sync,
540                        0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
541          return VK_NOT_READY;
542       }
543       return VK_SUCCESS;
544    }
545 
546    /* For other queries we need to check if the queue has submitted the query
547     * for execution at all.
548     */
549    assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
550    if (!q->maybe_available)
551       return VK_NOT_READY;
552 
553    /* For performance queries, we also need to check if the relevant GPU job
554     * has completed.
555     */
556    if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
557        vk_sync_wait(&device->vk, q->perf.last_job_sync,
558                     0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
559          return VK_NOT_READY;
560    }
561 
562    return VK_SUCCESS;
563 }
564 
565 static VkResult
566 query_is_available(struct v3dv_device *device,
567                    struct v3dv_query_pool *pool,
568                    uint32_t query,
569                    bool do_wait,
570                    bool *available)
571 {
572    struct v3dv_query *q = &pool->queries[query];
573 
574    if (do_wait) {
575       VkResult result = query_wait_available(device, pool, q, query);
576       if (result != VK_SUCCESS) {
577          *available = false;
578          return result;
579       }
580 
581       *available = true;
582    } else {
583       VkResult result = query_check_available(device, pool, q, query);
584       assert(result == VK_SUCCESS || result == VK_NOT_READY);
585       *available = (result == VK_SUCCESS);
586    }
587 
588    return VK_SUCCESS;
589 }
590 
591 static VkResult
592 write_occlusion_query_result(struct v3dv_device *device,
593                              struct v3dv_query_pool *pool,
594                              uint32_t query,
595                              bool do_64bit,
596                              void *data,
597                              uint32_t slot)
598 {
599    assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
600 
601    if (vk_device_is_lost(&device->vk))
602       return VK_ERROR_DEVICE_LOST;
603 
604    struct v3dv_query *q = &pool->queries[query];
605    assert(pool->occlusion.bo && pool->occlusion.bo->map);
606 
607    const uint8_t *query_addr =
608       ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
609    write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
610    return VK_SUCCESS;
611 }
612 
613 static VkResult
614 write_timestamp_query_result(struct v3dv_device *device,
615                              struct v3dv_query_pool *pool,
616                              uint32_t query,
617                              bool do_64bit,
618                              void *data,
619                              uint32_t slot)
620 {
621    assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
622 
623    struct v3dv_query *q = &pool->queries[query];
624 
625    const uint8_t *query_addr =
626       ((uint8_t *) pool->timestamp.bo->map) + q->timestamp.offset;
627 
628    write_to_buffer(data, slot, do_64bit, *((uint64_t *)query_addr));
629    return VK_SUCCESS;
630 }
631 
632 static VkResult
633 write_performance_query_result(struct v3dv_device *device,
634                                struct v3dv_query_pool *pool,
635                                uint32_t query,
636                                bool do_64bit,
637                                void *data,
638                                uint32_t slot)
639 {
640    assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
641 
642    struct v3dv_query *q = &pool->queries[query];
643    uint64_t counter_values[V3D_MAX_PERFCNT];
644 
645    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
646       struct drm_v3d_perfmon_get_values req = {
647          .id = q->perf.kperfmon_ids[i],
648          .values_ptr = (uintptr_t)(&counter_values[i *
649                                    DRM_V3D_MAX_PERF_COUNTERS])
650       };
651 
652       int ret = v3dv_ioctl(device->pdevice->render_fd,
653                            DRM_IOCTL_V3D_PERFMON_GET_VALUES,
654                            &req);
655 
656       if (ret) {
657          fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
658          return vk_error(device, VK_ERROR_DEVICE_LOST);
659       }
660    }
661 
662    for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
663       write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
664 
665    return VK_SUCCESS;
666 }
667 
668 static VkResult
669 write_query_result(struct v3dv_device *device,
670                    struct v3dv_query_pool *pool,
671                    uint32_t query,
672                    bool do_64bit,
673                    void *data,
674                    uint32_t slot)
675 {
676    switch (pool->query_type) {
677    case VK_QUERY_TYPE_OCCLUSION:
678       return write_occlusion_query_result(device, pool, query, do_64bit,
679                                           data, slot);
680    case VK_QUERY_TYPE_TIMESTAMP:
681       return write_timestamp_query_result(device, pool, query, do_64bit,
682                                           data, slot);
683    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
684       return write_performance_query_result(device, pool, query, do_64bit,
685                                             data, slot);
686    default:
687       unreachable("Unsupported query type");
688    }
689 }
690 
691 static uint32_t
692 get_query_result_count(struct v3dv_query_pool *pool)
693 {
694    switch (pool->query_type) {
695    case VK_QUERY_TYPE_OCCLUSION:
696    case VK_QUERY_TYPE_TIMESTAMP:
697       return 1;
698    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
699       return pool->perfmon.ncounters;
700    default:
701       unreachable("Unsupported query type");
702    }
703 }
704 
705 VkResult
706 v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
707                                 struct v3dv_query_pool *pool,
708                                 uint32_t first,
709                                 uint32_t count,
710                                 void *data,
711                                 VkDeviceSize stride,
712                                 VkQueryResultFlags flags)
713 {
714    assert(first < pool->query_count);
715    assert(first + count <= pool->query_count);
716    assert(data);
717 
718    const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
719       pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
720    const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
721    const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
722 
723    uint32_t result_count = get_query_result_count(pool);
724 
725    VkResult result = VK_SUCCESS;
726    for (uint32_t i = first; i < first + count; i++) {
727       bool available = false;
728       VkResult query_result =
729          query_is_available(device, pool, i, do_wait, &available);
730       if (query_result == VK_ERROR_DEVICE_LOST)
731          result = VK_ERROR_DEVICE_LOST;
732 
733       /**
734        * From the Vulkan 1.0 spec:
735        *
736        *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
737        *     both not set then no result values are written to pData for queries
738        *     that are in the unavailable state at the time of the call, and
739        *     vkGetQueryPoolResults returns VK_NOT_READY. However, availability
740        *     state is still written to pData for those queries if
741        *     VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
742        */
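      /* Per-query output layout: the query's result values come first
       * (result_count of them), optionally followed by one availability
       * value, each written as 32-bit or 64-bit according to do_64bit.
       */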
743       uint32_t slot = 0;
744 
745       const bool write_result = available || do_partial;
746       if (write_result)
747          write_query_result(device, pool, i, do_64bit, data, slot);
748       slot += result_count;
749 
750       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
751          write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
752 
753       if (!write_result && result != VK_ERROR_DEVICE_LOST)
754          result = VK_NOT_READY;
755 
756       data += stride;
757    }
758 
759    return result;
760 }
761 
762 VKAPI_ATTR VkResult VKAPI_CALL
763 v3dv_GetQueryPoolResults(VkDevice _device,
764                          VkQueryPool queryPool,
765                          uint32_t firstQuery,
766                          uint32_t queryCount,
767                          size_t dataSize,
768                          void *pData,
769                          VkDeviceSize stride,
770                          VkQueryResultFlags flags)
771 {
772    V3DV_FROM_HANDLE(v3dv_device, device, _device);
773    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
774 
775    return v3dv_get_query_pool_results_cpu(device, pool, firstQuery, queryCount,
776                                           pData, stride, flags);
777 }
778 
779 /* Emits a series of vkCmdDispatchBase calls to execute all the workgroups
780  * needed for the given number of queries, honoring the per-dispatch limit.
781  */
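/* For instance, with the 65535 workgroup-per-dispatch limit used below,
 * resetting 100000 queries would be emitted as two dispatches: 65535
 * workgroups starting at base 0 and 34465 starting at base 65535.
 */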
782 static void
783 cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer,
784                                  uint32_t query_count)
785 {
786    VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
787 
788    uint32_t dispatched = 0;
789    const uint32_t max_batch_size = 65535;
790    while (dispatched < query_count) {
791       uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size);
792       v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1);
793       dispatched += batch_size;
794    }
795 }
796 
797 void
798 v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer,
799                                             struct v3dv_query_pool *pool,
800                                             uint32_t query, uint32_t count,
801                                             uint8_t availability)
802 {
803    assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
804           pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
805 
806    struct v3dv_device *device = cmd_buffer->device;
807    VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
808 
809    /* We are about to emit a compute job to set query availability and we need
810     * to ensure this executes after the graphics work using the queries has
811     * completed.
812     */
813    VkMemoryBarrier2 barrier = {
814       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
815       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
816       .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
817    };
818    VkDependencyInfo barrier_info = {
819       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
820       .memoryBarrierCount = 1,
821       .pMemoryBarriers = &barrier,
822    };
823    v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
824 
825    /* Dispatch queries */
826    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
827 
828    v3dv_CmdBindPipeline(vk_cmd_buffer,
829                         VK_PIPELINE_BIND_POINT_COMPUTE,
830                         device->queries.avail_pipeline);
831 
832    v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
833                               VK_PIPELINE_BIND_POINT_COMPUTE,
834                               device->queries.avail_pipeline_layout,
835                               0, 1, &pool->meta.descriptor_set,
836                               0, NULL);
837 
838    struct {
839       uint32_t offset;
840       uint32_t query;
841       uint8_t availability;
842    } push_data = { pool->occlusion.avail_offset, query, availability };
843    v3dv_CmdPushConstants(vk_cmd_buffer,
844                          device->queries.avail_pipeline_layout,
845                          VK_SHADER_STAGE_COMPUTE_BIT,
846                          0, sizeof(push_data), &push_data);
847    cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
848 
849    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
850 }
851 
852 static void
853 cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
854                                            struct v3dv_query_pool *pool,
855                                            uint32_t query, uint32_t count)
856 {
857    struct v3dv_device *device = cmd_buffer->device;
858    VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
859 
860    /* Ensure the GPU is done with the queries in the graphics queue before
861     * we reset in the compute queue.
862     */
863    VkMemoryBarrier2 barrier = {
864       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
865       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
866       .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
867    };
868    VkDependencyInfo barrier_info = {
869       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
870       .memoryBarrierCount = 1,
871       .pMemoryBarriers = &barrier,
872    };
873    v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
874 
875    /* Emit compute reset */
876    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
877 
878    v3dv_CmdBindPipeline(vk_cmd_buffer,
879                         VK_PIPELINE_BIND_POINT_COMPUTE,
880                         device->queries.reset_occlusion_pipeline);
881 
882    v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
883                               VK_PIPELINE_BIND_POINT_COMPUTE,
884                               device->queries.reset_occlusion_pipeline_layout,
885                               0, 1, &pool->meta.descriptor_set,
886                               0, NULL);
887    struct {
888       uint32_t offset;
889       uint32_t query;
890    } push_data = { pool->occlusion.avail_offset, query };
891    v3dv_CmdPushConstants(vk_cmd_buffer,
892                          device->queries.reset_occlusion_pipeline_layout,
893                          VK_SHADER_STAGE_COMPUTE_BIT,
894                          0, sizeof(push_data), &push_data);
895 
896    cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
897 
898    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
899 
900    /* Ensure future work in the graphics queue using the queries doesn't start
901     * before the reset completed.
902     */
903    barrier = (VkMemoryBarrier2) {
904       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
905       .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
906       .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT,
907    };
908    barrier_info = (VkDependencyInfo) {
909       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
910       .memoryBarrierCount = 1,
911       .pMemoryBarriers = &barrier,
912    };
913    v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
914 }
915 
916 static void
917 cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
918                                  struct v3dv_query_pool *pool,
919                                  uint32_t first, uint32_t count)
920 {
921    assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
922    cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count);
923 }
924 
925 static void
926 cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer,
927                                      struct v3dv_query_pool *pool,
928                                      uint32_t first, uint32_t count)
929 {
930    assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION);
931 
932    struct v3dv_job *job =
933       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
934                                      V3DV_JOB_TYPE_CPU_RESET_QUERIES,
935                                      cmd_buffer, -1);
936    v3dv_return_if_oom(cmd_buffer, NULL);
937    job->cpu.query_reset.pool = pool;
938    job->cpu.query_reset.first = first;
939    job->cpu.query_reset.count = count;
940    list_addtail(&job->list_link, &cmd_buffer->jobs);
941 }
942 
943 VKAPI_ATTR void VKAPI_CALL
944 v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
945                        VkQueryPool queryPool,
946                        uint32_t firstQuery,
947                        uint32_t queryCount)
948 {
949    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
950    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
951 
952    /* Resets can only happen outside a render pass instance so we should not
953     * be in the middle of job recording.
954     */
955    assert(cmd_buffer->state.pass == NULL);
956    assert(cmd_buffer->state.job == NULL);
957 
958    assert(firstQuery < pool->query_count);
959    assert(firstQuery + queryCount <= pool->query_count);
960 
961    /* We can reset occlusion queries in the GPU, but for other query types
962     * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed
963     * in the queue.
964     */
965    if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
966       cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery, queryCount);
967    } else {
968       cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool,
969                                            firstQuery, queryCount);
970    }
971 }
972 
973 /**
974  * Creates a descriptor pool so we can allocate descriptors for the destination
975  * buffers of vkCmdCopyQueryPoolResults for query types where the copy is
976  * implemented on the GPU.
977  */
978 static VkResult
979 create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
980 {
981    /* If this is not the first pool we create for this command buffer,
982     * size it based on the size of the currently exhausted pool.
983     */
984    uint32_t descriptor_count = 32;
985    if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) {
986       struct v3dv_descriptor_pool *exhausted_pool =
987          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool);
988       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
989    }
990 
991    /* Create the descriptor pool */
992    cmd_buffer->meta.query.dspool = VK_NULL_HANDLE;
993    VkDescriptorPoolSize pool_size = {
994       .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
995       .descriptorCount = descriptor_count,
996    };
997    VkDescriptorPoolCreateInfo info = {
998       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
999       .maxSets = descriptor_count,
1000       .poolSizeCount = 1,
1001       .pPoolSizes = &pool_size,
1002       .flags = 0,
1003    };
1004    VkResult result =
1005       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
1006                                 &info,
1007                                 &cmd_buffer->device->vk.alloc,
1008                                 &cmd_buffer->meta.query.dspool);
1009 
1010    if (result == VK_SUCCESS) {
1011       assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
1012       const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool;
1013 
1014       v3dv_cmd_buffer_add_private_obj(
1015          cmd_buffer, (uintptr_t) vk_pool,
1016          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
1017 
1018       struct v3dv_descriptor_pool *pool =
1019          v3dv_descriptor_pool_from_handle(vk_pool);
1020       pool->is_driver_internal = true;
1021    }
1022 
1023    return result;
1024 }
1025 
1026 static VkResult
1027 allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
1028                                        VkDescriptorSet *set)
1029 {
1030    /* Make sure we have a descriptor pool */
1031    VkResult result;
1032    if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) {
1033       result = create_storage_buffer_descriptor_pool(cmd_buffer);
1034       if (result != VK_SUCCESS)
1035          return result;
1036    }
1037    assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
1038 
1039    /* Allocate descriptor set */
1040    struct v3dv_device *device = cmd_buffer->device;
1041    VkDevice vk_device = v3dv_device_to_handle(device);
1042    VkDescriptorSetAllocateInfo info = {
1043       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
1044       .descriptorPool = cmd_buffer->meta.query.dspool,
1045       .descriptorSetCount = 1,
1046       .pSetLayouts = &device->queries.buf_descriptor_set_layout,
1047    };
1048    result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
1049 
1050    /* If we ran out of pool space, grow the pool and try again */
1051    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
1052       result = create_storage_buffer_descriptor_pool(cmd_buffer);
1053       if (result == VK_SUCCESS) {
1054          info.descriptorPool = cmd_buffer->meta.query.dspool;
1055          result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
1056       }
1057    }
1058 
1059    return result;
1060 }
1061 
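/* Each combination of result flags gets its own copy pipeline. The index
 * computed below packs the flags into 3 bits (bit 0: 64-bit results,
 * bit 1: with availability, bit 2: partial results), so for example
 * VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT selects
 * pipeline 3 in device->queries.copy_pipeline.
 */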
1062 static uint32_t
1063 copy_pipeline_index_from_flags(VkQueryResultFlags flags)
1064 {
1065    uint32_t index = 0;
1066    if (flags & VK_QUERY_RESULT_64_BIT)
1067       index |= 1;
1068    if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
1069       index |= 2;
1070    if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
1071       index |= 4;
1072    assert(index < 8);
1073    return index;
1074 }
1075 
1076 static nir_shader *
1077 get_copy_query_results_cs(VkQueryResultFlags flags);
1078 
1079 static void
1080 cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer,
1081                                         struct v3dv_query_pool *pool,
1082                                         uint32_t first, uint32_t count,
1083                                         struct v3dv_buffer *buf,
1084                                         uint32_t offset, uint32_t stride,
1085                                         VkQueryResultFlags flags)
1086 {
1087    struct v3dv_device *device = cmd_buffer->device;
1088    VkDevice vk_device = v3dv_device_to_handle(device);
1089    VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
1090 
1091    /* Create the required copy pipeline if not yet created */
1092    uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags);
1093    if (!device->queries.copy_pipeline[pipeline_idx]) {
1094       nir_shader *copy_query_results_cs_nir = get_copy_query_results_cs(flags);
1095       VkResult result =
1096          v3dv_create_compute_pipeline_from_nir(
1097                device, copy_query_results_cs_nir,
1098                device->queries.copy_pipeline_layout,
1099                &device->queries.copy_pipeline[pipeline_idx]);
1100       ralloc_free(copy_query_results_cs_nir);
1101       if (result != VK_SUCCESS) {
1102          fprintf(stderr, "Failed to create copy query results pipeline\n");
1103          return;
1104       }
1105    }
1106 
1107    /* FIXME: do we need this barrier? Since vkCmdEndQuery should've been called
1108     * already, and that call already waits, maybe we don't need it (this is
1109     * serialized in the compute queue with EndQuery anyway).
1110     */
1111    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1112       VkMemoryBarrier2 barrier = {
1113          .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
1114          .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
1115          .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
1116       };
1117       VkDependencyInfo barrier_info = {
1118          .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
1119          .memoryBarrierCount = 1,
1120          .pMemoryBarriers = &barrier,
1121       };
1122       v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
1123    }
1124 
1125    /* Allocate and setup descriptor set for output buffer */
1126    VkDescriptorSet out_buf_descriptor_set;
1127    VkResult result =
1128       allocate_storage_buffer_descriptor_set(cmd_buffer,
1129                                              &out_buf_descriptor_set);
1130    if (result != VK_SUCCESS) {
1131       fprintf(stderr, "vkCmdCopyQueryPoolResults failed: "
1132               "could not allocate descriptor.\n");
1133       return;
1134    }
1135 
1136    VkDescriptorBufferInfo desc_buf_info = {
1137       .buffer = v3dv_buffer_to_handle(buf),
1138       .offset = 0,
1139       .range = VK_WHOLE_SIZE,
1140    };
1141    VkWriteDescriptorSet write = {
1142       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1143       .dstSet = out_buf_descriptor_set,
1144       .dstBinding = 0,
1145       .dstArrayElement = 0,
1146       .descriptorCount = 1,
1147       .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1148       .pBufferInfo = &desc_buf_info,
1149    };
1150    v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
1151 
1152    /* Dispatch copy */
1153    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
1154 
1155    assert(device->queries.copy_pipeline[pipeline_idx]);
1156    v3dv_CmdBindPipeline(vk_cmd_buffer,
1157                         VK_PIPELINE_BIND_POINT_COMPUTE,
1158                         device->queries.copy_pipeline[pipeline_idx]);
1159 
1160    VkDescriptorSet sets[2] = {
1161       pool->meta.descriptor_set,
1162       out_buf_descriptor_set,
1163    };
1164    v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
1165                               VK_PIPELINE_BIND_POINT_COMPUTE,
1166                               device->queries.copy_pipeline_layout,
1167                               0, 2, sets, 0, NULL);
1168 
1169    struct {
1170       uint32_t avail_offset, first, offset, stride, flags;
1171    } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags };
1172    v3dv_CmdPushConstants(vk_cmd_buffer,
1173                          device->queries.copy_pipeline_layout,
1174                          VK_SHADER_STAGE_COMPUTE_BIT,
1175                          0, sizeof(push_data), &push_data);
1176 
1177    cmd_buffer_emit_dispatch_queries(cmd_buffer, count);
1178 
1179    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
1180 }
1181 
1182 static void
1183 cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer *cmd_buffer,
1184                                             struct v3dv_query_pool *pool,
1185                                             uint32_t first,
1186                                             uint32_t count,
1187                                             struct v3dv_buffer *dst,
1188                                             uint32_t offset,
1189                                             uint32_t stride,
1190                                             VkQueryResultFlags flags)
1191 {
1192    struct v3dv_job *job =
1193       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
1194                                      V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
1195                                      cmd_buffer, -1);
1196    v3dv_return_if_oom(cmd_buffer, NULL);
1197 
1198    job->cpu.query_copy_results.pool = pool;
1199    job->cpu.query_copy_results.first = first;
1200    job->cpu.query_copy_results.count = count;
1201    job->cpu.query_copy_results.dst = dst;
1202    job->cpu.query_copy_results.offset = offset;
1203    job->cpu.query_copy_results.stride = stride;
1204    job->cpu.query_copy_results.flags = flags;
1205 
1206    list_addtail(&job->list_link, &cmd_buffer->jobs);
1207 }
1208 
1209 VKAPI_ATTR void VKAPI_CALL
1210 v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
1211                              VkQueryPool queryPool,
1212                              uint32_t firstQuery,
1213                              uint32_t queryCount,
1214                              VkBuffer dstBuffer,
1215                              VkDeviceSize dstOffset,
1216                              VkDeviceSize stride,
1217                              VkQueryResultFlags flags)
1218 {
1219    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1220    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1221    V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);
1222 
1223    /* Copies can only happen outside a render pass instance so we should not
1224     * be in the middle of job recording.
1225     */
1226    assert(cmd_buffer->state.pass == NULL);
1227    assert(cmd_buffer->state.job == NULL);
1228 
1229    assert(firstQuery < pool->query_count);
1230    assert(firstQuery + queryCount <= pool->query_count);
1231 
1232    /* For occlusion queries we implement the copy in the GPU but for other
1233     * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu
1234     * when executed in the queue.
1235     */
1236    if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
1237       cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool,
1238                                               firstQuery, queryCount,
1239                                               dst, (uint32_t) dstOffset,
1240                                               (uint32_t) stride, flags);
1241    } else {
1242       cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool,
1243                                                   firstQuery, queryCount,
1244                                                   dst, (uint32_t)dstOffset,
1245                                                   (uint32_t) stride, flags);
1246    }
1247 }
1248 
1249 VKAPI_ATTR void VKAPI_CALL
1250 v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,
1251                    VkQueryPool queryPool,
1252                    uint32_t query,
1253                    VkQueryControlFlags flags)
1254 {
1255    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1256    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1257 
1258    v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags);
1259 }
1260 
1261 VKAPI_ATTR void VKAPI_CALL
1262 v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
1263                  VkQueryPool queryPool,
1264                  uint32_t query)
1265 {
1266    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1267    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
1268 
1269    v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
1270 }
1271 
1272 void
1273 v3dv_reset_query_pool_cpu(struct v3dv_device *device,
1274                           struct v3dv_query_pool *pool,
1275                           uint32_t first,
1276                           uint32_t count)
1277 {
1278    mtx_lock(&device->query_mutex);
1279 
1280    if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
1281       assert(first + count <= pool->query_count);
1282 
1283       /* Reset timestamp */
1284       uint8_t *base_addr;
1285       base_addr  = ((uint8_t *) pool->timestamp.bo->map) +
1286                     pool->queries[first].timestamp.offset;
1287       memset(base_addr, 0, 8 * count);
1288 
1289       for (uint32_t i = first; i < first + count; i++) {
1290          if (vk_sync_reset(&device->vk, pool->queries[i].timestamp.sync) != VK_SUCCESS)
1291             fprintf(stderr, "Failed to reset sync\n");
1292       }
1293 
1294       mtx_unlock(&device->query_mutex);
1295       return;
1296    }
1297 
1298    for (uint32_t i = first; i < first + count; i++) {
1299       assert(i < pool->query_count);
1300       struct v3dv_query *q = &pool->queries[i];
1301       q->maybe_available = false;
1302       switch (pool->query_type) {
1303       case VK_QUERY_TYPE_OCCLUSION: {
1304          /* Reset availability */
1305          uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) +
1306                               pool->occlusion.avail_offset + first;
1307          memset(base_addr, 0, count);
1308 
1309          /* Reset occlusion counter */
1310          const uint8_t *q_addr =
1311             ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
1312          uint32_t *counter = (uint32_t *) q_addr;
1313          *counter = 0;
1314          break;
1315       }
1316       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1317          kperfmon_destroy(device, pool, i);
1318          kperfmon_create(device, pool, i);
1319          if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
1320             fprintf(stderr, "Failed to reset sync\n");
1321          break;
1322       default:
1323          unreachable("Unsupported query type");
1324       }
1325    }
1326 
1327    mtx_unlock(&device->query_mutex);
1328 }

VKAPI_ATTR void VKAPI_CALL
v3dv_ResetQueryPool(VkDevice _device,
                    VkQueryPool queryPool,
                    uint32_t firstQuery,
                    uint32_t queryCount)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);

   v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount);
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice,
   uint32_t queueFamilyIndex,
   uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters,
   VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);

   return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount,
                                                                pCounters,
                                                                pCounterDescriptions);
}

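/* A single kernel perfmon can sample at most DRM_V3D_MAX_PERF_COUNTERS
 * counters, so the number of passes required is the requested counter count
 * divided by that limit, rounded up.
 */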
VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice,
   const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
   uint32_t *pNumPasses)
{
   *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
                              DRM_V3D_MAX_PERF_COUNTERS);
}

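/* Acquiring and releasing the profiling lock requires no work in this
 * driver, so both entry points are no-ops.
 */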
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AcquireProfilingLockKHR(
   VkDevice _device,
   const VkAcquireProfilingLockInfoKHR *pInfo)
{
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_ReleaseProfilingLockKHR(VkDevice device)
{
}

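/* NIR helpers shared by the compute shaders below. Availability is stored
 * as one byte per query, so availability offsets are byte offsets relative
 * to the start of the availability data in the pool BO.
 */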
static inline void
nir_set_query_availability(nir_builder *b,
                           nir_def *buf,
                           nir_def *offset,
                           nir_def *query_idx,
                           nir_def *avail)
{
   offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
   nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1);
}

static inline nir_def *
nir_get_query_availability(nir_builder *b,
                           nir_def *buf,
                           nir_def *offset,
                           nir_def *query_idx)
{
   offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
   nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1);
   return nir_i2i32(b, avail);
}

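/* Builds a compute shader that marks queries as available or unavailable.
 * One workgroup is dispatched per query; the push constants provide the
 * offset of the availability data, the base query index and the availability
 * value to write.
 */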
static nir_shader *
get_set_query_availability_cs()
{
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
                                                  "set query availability cs");

   nir_def *buf =
      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
                                .desc_set = 0,
                                .binding = 0,
                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);

   /* This assumes a local size of 1 and a horizontal-only dispatch. If we
    * ever change any of these parameters we need to update how we compute the
    * query index here.
    */
   nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);

   nir_def *offset =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);

   nir_def *query_idx =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);

   nir_def *avail =
      nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1);

   query_idx = nir_iadd(&b, query_idx, wg_id);
   nir_set_query_availability(&b, buf, offset, query_idx, avail);

   return b.shader;
}

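/* Computes the byte offset of a query's occlusion counter within the pool
 * BO. The arithmetic encodes the occlusion BO layout: queries are grouped
 * 16 to a group, each group occupies 1024 bytes, and each counter within a
 * group takes 4 bytes.
 */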
static inline nir_def *
nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx)
{
   nir_def *query_group = nir_udiv_imm(b, query_idx, 16);
   nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16);
   nir_def *offset =
      nir_iadd(b, nir_imul_imm(b, query_group, 1024),
                  nir_imul_imm(b, query_group_offset, 4));
   return offset;
}

static inline void
nir_reset_occlusion_counter(nir_builder *b,
                            nir_def *buf,
                            nir_def *query_idx)
{
   nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
   nir_def *zero = nir_imm_int(b, 0);
   nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4);
}

static inline nir_def *
nir_read_occlusion_counter(nir_builder *b,
                           nir_def *buf,
                           nir_def *query_idx)
{
   nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
   return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4);
}

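/* Builds a compute shader that resets occlusion queries: for one query per
 * workgroup it clears both the availability byte and the occlusion counter.
 */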
static nir_shader *
get_reset_occlusion_query_cs()
{
   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
                                                  "reset occlusion query cs");

   nir_def *buf =
      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
                                .desc_set = 0,
                                .binding = 0,
                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);

   /* This assumes a local size of 1 and a horizontal-only dispatch. If we
    * ever change any of these parameters we need to update how we compute the
    * query index here.
    */
   nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);

   nir_def *avail_offset =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);

   nir_def *base_query_idx =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);

   nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);

   nir_set_query_availability(&b, buf, avail_offset, query_idx,
                              nir_imm_intN_t(&b, 0, 8));
   nir_reset_occlusion_counter(&b, buf, query_idx);

   return b.shader;
}

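/* Emits a store of a query result to the output buffer, as either a 32-bit
 * or a 64-bit value depending on the copy flags, and advances *offset by the
 * size written.
 */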
static void
write_query_buffer(nir_builder *b,
                   nir_def *buf,
                   nir_def **offset,
                   nir_def *value,
                   bool flag_64bit)
{
   if (flag_64bit) {
      /* Create a 64-bit value using a vec2 with the .Y component set to 0
       * so we can write a 64-bit value in a single store.
       */
      nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0));
      nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8);
      *offset = nir_iadd_imm(b, *offset, 8);
   } else {
      nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4);
      *offset = nir_iadd_imm(b, *offset, 4);
   }
}

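/* Builds a compute shader that copies occlusion query results to an output
 * buffer, one query per workgroup. The shader is specialized for the given
 * VkQueryResultFlags: 64-bit vs 32-bit results, whether availability is
 * written, and whether partial results are allowed.
 */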
static nir_shader *
get_copy_query_results_cs(VkQueryResultFlags flags)
{
   bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT;
   bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
   bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;

   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
                                                  "copy query results cs");

   nir_def *buf =
      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
                                .desc_set = 0,
                                .binding = 0,
                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);

   nir_def *buf_out =
      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
                                .desc_set = 1,
                                .binding = 0,
                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);

   /* Read push constants */
   nir_def *avail_offset =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4);

   nir_def *base_query_idx =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4);

   nir_def *base_offset_out =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4);

   nir_def *stride =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4);

   /* This assumes a local size of 1 and a horizontal-only dispatch. If we
    * ever change any of these parameters we need to update how we compute the
    * query index here.
    */
   nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
   nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);

   /* Read query availability if needed */
   nir_def *avail = NULL;
   if (flag_avail || !flag_partial)
      avail = nir_get_query_availability(&b, buf, avail_offset, query_idx);

   /* Write occlusion query result... */
   nir_def *offset =
      nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride));

   /* ...if partial is requested, we always write */
   if (flag_partial) {
      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
   } else {
      /* ...otherwise, we only write if the query is available */
      nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0));
         nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
         write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
      nir_pop_if(&b, if_stmt);
   }

   /* Write query availability */
   if (flag_avail)
      write_query_buffer(&b, buf_out, &offset, avail, flag_64bit);

   return b.shader;
}

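/**
 * Creates the device-level objects shared by the query compute pipelines:
 * the storage-buffer descriptor set layout, the pipeline layouts, and the
 * availability and occlusion-reset pipelines. The copy pipelines themselves
 * are created lazily on first use.
 */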
static bool
create_query_pipelines(struct v3dv_device *device)
{
   VkResult result;
   VkPipeline pipeline;

   /* Set layout: single storage buffer */
   if (!device->queries.buf_descriptor_set_layout) {
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      };
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &descriptor_set_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &descriptor_set_layout_info,
                                        &device->vk.alloc,
                                        &device->queries.buf_descriptor_set_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   /* Set availability pipeline.
    *
    * Pipeline layout:
    *  - 1 storage buffer for the BO with the query availability.
    *  - Push constants:
    *    0B: offset of the availability info in the buffer (4 bytes)
    *    4B: base query index (4 bytes).
    *    8B: availability (1 byte).
    */
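   /* Note: the byte offsets above must match the nir_load_push_constant()
    * bases used in get_set_query_availability_cs().
    */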
   if (!device->queries.avail_pipeline_layout) {
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
             &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.avail_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   if (!device->queries.avail_pipeline) {
      nir_shader *set_query_availability_cs_nir = get_set_query_availability_cs();
      result = v3dv_create_compute_pipeline_from_nir(device,
                                                     set_query_availability_cs_nir,
                                                     device->queries.avail_pipeline_layout,
                                                     &pipeline);
      ralloc_free(set_query_availability_cs_nir);
      if (result != VK_SUCCESS)
         return false;

      device->queries.avail_pipeline = pipeline;
   }

   /* Reset occlusion query pipeline.
    *
    * Pipeline layout:
    *  - 1 storage buffer for the BO with the occlusion and availability data.
    *  - Push constants:
    *    0B: offset of the availability info in the buffer (4B)
    *    4B: base query index (4B)
    */
   if (!device->queries.reset_occlusion_pipeline_layout) {
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
             &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.reset_occlusion_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   if (!device->queries.reset_occlusion_pipeline) {
      nir_shader *reset_occlusion_query_cs_nir = get_reset_occlusion_query_cs();
      result = v3dv_create_compute_pipeline_from_nir(
                  device,
                  reset_occlusion_query_cs_nir,
                  device->queries.reset_occlusion_pipeline_layout,
                  &pipeline);
      ralloc_free(reset_occlusion_query_cs_nir);
      if (result != VK_SUCCESS)
         return false;

      device->queries.reset_occlusion_pipeline = pipeline;
   }

   /* Copy query results pipelines.
    *
    * Pipeline layout:
    *  - 1 storage buffer for the BO with the query availability and occlusion.
    *  - 1 storage buffer for the output.
    *  - Push constants:
    *    0B: offset of the availability info in the buffer (4B)
    *    4B: base query index (4B)
    *    8B: offset into output buffer (4B)
    *    12B: stride (4B)
    *
    * We create multiple specialized pipelines depending on the copy flags
    * to remove conditionals from the copy shader and get more optimized
    * pipelines.
    */
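   /* The copy shader is specialized on three result flags (64_BIT,
    * WITH_AVAILABILITY and PARTIAL), hence the limit of 8 pipeline variants
    * mentioned below.
    */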
   if (!device->queries.copy_pipeline_layout) {
      VkDescriptorSetLayout set_layouts[2] = {
         device->queries.buf_descriptor_set_layout,
         device->queries.buf_descriptor_set_layout
      };
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 2,
         .pSetLayouts = set_layouts,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
             &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.copy_pipeline_layout);

      if (result != VK_SUCCESS)
         return false;
   }

   /* Actual copy pipelines are created lazily on demand since there can be
    * up to 8 depending on the flags used; however, applications are likely
    * to use the same flags every time, so typically only one pipeline is
    * required.
    */

   return true;
}

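/**
 * Destroys the pipelines, pipeline layouts and descriptor set layout created
 * for the query compute shaders. Destroying VK_NULL_HANDLE handles (e.g.
 * copy pipeline variants that were never created) is a no-op.
 */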
static void
destroy_query_pipelines(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   /* Availability pipeline */
   v3dv_DestroyPipeline(_device, device->queries.avail_pipeline,
                        &device->vk.alloc);
   device->queries.avail_pipeline = VK_NULL_HANDLE;
   v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout,
                              &device->vk.alloc);
   device->queries.avail_pipeline_layout = VK_NULL_HANDLE;

   /* Reset occlusion pipeline */
   v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline,
                        &device->vk.alloc);
   device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE;
   v3dv_DestroyPipelineLayout(_device,
                              device->queries.reset_occlusion_pipeline_layout,
                              &device->vk.alloc);
   device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE;

   /* Copy pipelines */
   for (int i = 0; i < 8; i++) {
      v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i],
                           &device->vk.alloc);
      device->queries.copy_pipeline[i] = VK_NULL_HANDLE;
   }
   v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout,
                              &device->vk.alloc);
   device->queries.copy_pipeline_layout = VK_NULL_HANDLE;

   v3dv_DestroyDescriptorSetLayout(_device,
                                   device->queries.buf_descriptor_set_layout,
                                   &device->vk.alloc);
   device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE;
}

/**
 * Allocates device resources for implementing certain types of queries.
 */
VkResult
v3dv_query_allocate_resources(struct v3dv_device *device)
{
   if (!create_query_pipelines(device))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   return VK_SUCCESS;
}

void
v3dv_query_free_resources(struct v3dv_device *device)
{
   destroy_query_pipelines(device);
}