/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_query_pool.h"

#include "agx_compile.h"
#include "agx_pack.h"
#include "hk_buffer.h"
#include "hk_cmd_buffer.h"
#include "hk_device.h"
#include "hk_entrypoints.h"
#include "hk_shader.h"

#include "libagx_shaders.h"
#include "vk_common_entrypoints.h"

#include "asahi/lib/agx_bo.h"
#include "asahi/libagx/libagx.h"
#include "asahi/libagx/query.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"

#include "util/os_time.h"
#include "util/u_dynarray.h"
#include "vulkan/vulkan_core.h"

struct hk_query_report {
   uint64_t value;
};

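/* Timestamp queries encode availability in the report itself (via the
 * LIBAGX_QUERY_UNAVAILABLE sentinel), so only non-timestamp pools carry a
 * separate array of 32-bit availability words at the start of the BO.
 */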
static inline bool
hk_has_available(const struct hk_query_pool *pool)
{
   return pool->vk.query_type != VK_QUERY_TYPE_TIMESTAMP;
}

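/* For occlusion pools, the data region after query_start holds a uint16_t
 * table remapping pool-relative query indices to slots in the device-wide
 * occlusion heap (dev->occlusion_queries).
 */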
static uint16_t *
hk_pool_oq_index_ptr(const struct hk_query_pool *pool)
{
   return agx_bo_map(pool->bo) + pool->query_start;
}

static uint32_t
hk_reports_per_query(struct hk_query_pool *pool)
{
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      return 1;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return util_bitcount(pool->vk.pipeline_statistics);
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Primitives succeeded and primitives needed */
      return 2;
   default:
      unreachable("Unsupported query type");
   }
}

static void
hk_flush_if_timestamp(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   /* There might not otherwise be a barrier between the timestamp write and
    * the copy, but we need one to give the CPU a chance to write the
    * timestamp. This could maybe be optimized.
    */
   if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) {
      perf_debug(dev, "Flushing for timestamp copy");
      hk_cmd_buffer_end_graphics(cmd);
      hk_cmd_buffer_end_compute(cmd);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   VK_FROM_HANDLE(hk_device, dev, device);
   struct hk_query_pool *pool;

   bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION;
   bool timestamp = pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP;
   unsigned occlusion_queries = occlusion ? pCreateInfo->queryCount : 0;

   /* Workaround for DXVK on old kernels */
   if (!agx_supports_timestamps(&dev->dev))
      timestamp = false;

   pool =
      vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool));
   if (!pool)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* We place the availability first and then the data */
   pool->query_start = 0;
   if (hk_has_available(pool)) {
      pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
                                sizeof(struct hk_query_report));
   }

   uint32_t reports_per_query = hk_reports_per_query(pool);
   pool->query_stride = reports_per_query * sizeof(struct hk_query_report);

   if (pool->vk.query_count > 0) {
      uint32_t bo_size = pool->query_start;

      /* For occlusion queries, we stick the query index remapping here */
      if (occlusion_queries)
         bo_size += sizeof(uint16_t) * pool->vk.query_count;
      else
         bo_size += pool->query_stride * pool->vk.query_count;

      /* The kernel requires that timestamp buffers are SHARED */
      enum agx_bo_flags flags = AGX_BO_WRITEBACK;
      if (timestamp)
         flags |= AGX_BO_SHARED;

      pool->bo = agx_bo_create(&dev->dev, bo_size, 0, flags, "Query pool");
      if (!pool->bo) {
         hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator);
         return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      }

      /* Timestamp buffers must be explicitly bound as such before we can use
       * them.
       */
      if (timestamp) {
         int ret = dev->dev.ops.bo_bind_object(
            &dev->dev, pool->bo, &pool->handle, pool->bo->size, 0,
            ASAHI_BIND_OBJECT_USAGE_TIMESTAMPS);

         if (ret) {
            hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool),
                                pAllocator);
            return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         }

         assert(pool->handle && "handles are nonzero");
      }
   }

   uint16_t *oq_index = hk_pool_oq_index_ptr(pool);

   for (unsigned i = 0; i < occlusion_queries; ++i) {
      uint64_t zero = 0;
      unsigned index;

      VkResult result = hk_descriptor_table_add(
         dev, &dev->occlusion_queries, &zero, sizeof(uint64_t), &index);

      if (result != VK_SUCCESS) {
         hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator);
         return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      }

      /* We increment as we go so we can clean up properly if we run out */
      assert(pool->oq_queries < occlusion_queries);
      oq_index[pool->oq_queries++] = index;
   }

   *pQueryPool = hk_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool,
                    const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(hk_device, dev, device);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   if (!pool)
      return;

   uint16_t *oq_index = hk_pool_oq_index_ptr(pool);

   for (unsigned i = 0; i < pool->oq_queries; ++i) {
      hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]);
   }

   if (pool->handle)
      dev->dev.ops.bo_unbind_object(&dev->dev, pool->handle, 0);

   agx_bo_unreference(&dev->dev, pool->bo);
   vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
}

static uint64_t
hk_query_available_addr(struct hk_query_pool *pool, uint32_t query)
{
   assert(hk_has_available(pool));
   assert(query < pool->vk.query_count);
   return pool->bo->va->addr + query * sizeof(uint32_t);
}

static uint32_t *
hk_query_available_map(struct hk_query_pool *pool, uint32_t query)
{
   assert(hk_has_available(pool));
   assert(query < pool->vk.query_count);
   return (uint32_t *)agx_bo_map(pool->bo) + query;
}

static uint64_t
hk_query_offset(struct hk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->query_start + query * pool->query_stride;
}

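/* Occlusion results live in the device-wide occlusion heap and are reached
 * through the index remap table; all other query types store their reports
 * directly in the pool's BO after query_start.
 */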
static uint64_t
hk_query_report_addr(struct hk_device *dev, struct hk_query_pool *pool,
                     uint32_t query)
{
   if (pool->oq_queries) {
      uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
      return dev->occlusion_queries.bo->va->addr +
             (oq_index[query] * sizeof(uint64_t));
   } else {
      return pool->bo->va->addr + hk_query_offset(pool, query);
   }
}

static struct hk_query_report *
hk_query_report_map(struct hk_device *dev, struct hk_query_pool *pool,
                    uint32_t query)
{
   if (pool->oq_queries) {
      uint64_t *queries = (uint64_t *)agx_bo_map(dev->occlusion_queries.bo);
      uint16_t *oq_index = hk_pool_oq_index_ptr(pool);

      return (struct hk_query_report *)&queries[oq_index[query]];
   } else {
      return (void *)((char *)agx_bo_map(pool->bo) +
                      hk_query_offset(pool, query));
   }
}

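/* Flush the immediate writes accumulated by hk_queue_write(after_gfx=true):
 * upload the (address, value) pairs and dispatch a single libagx_write_u32s
 * kernel on the given control stream.
 */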
void
hk_dispatch_imm_writes(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
{
   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);

   /* As soon as we mark a query available, it needs to be available system
    * wide, otherwise a CPU-side get-result could read a stale value. As such,
    * we cache flush before and then let coherency work its magic. Without
    * this barrier, we get flakes in
    *
    * dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard
    */
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   hk_cdm_cache_flush(dev, cs);

   perf_debug(dev, "Queued writes");

   uint64_t params =
      hk_pool_upload(cmd, cs->imm_writes.data, cs->imm_writes.size, 16);

   uint32_t count =
      util_dynarray_num_elements(&cs->imm_writes, struct libagx_imm_write);
   assert(count > 0);

   libagx_write_u32s(cs, agx_1d(count), AGX_BARRIER_ALL, params);
}

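/* Write a 32-bit value to a GPU address from the command buffer. With
 * after_gfx set, the write is batched into the control stream's imm_writes
 * and executed once graphics work finishes (see hk_dispatch_imm_writes);
 * otherwise it is emitted immediately on the current (compute) control
 * stream.
 */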
void
hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
               bool after_gfx)
{
   struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
      cmd, after_gfx ? &cmd->current_cs.post_gfx : &cmd->current_cs.cs, true);
   if (!cs)
      return;

   /* TODO: Generalize this mechanism suitably */
   if (after_gfx) {
      struct libagx_imm_write imm = {.address = address, .value = value};

      if (!cs->imm_writes.data) {
         util_dynarray_init(&cs->imm_writes, NULL);
      }

      util_dynarray_append(&cs->imm_writes, struct libagx_imm_write, imm);
      return;
   }

   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);

   /* As soon as we mark a query available, it needs to be available system
    * wide, otherwise a CPU-side get-result could read a stale value. As such,
    * we cache flush before and then let coherency work its magic. Without
    * this barrier, we get flakes in
    *
    * dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard
    */
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   hk_cdm_cache_flush(dev, cs);

   perf_debug(dev, "Queued write");
   libagx_write_u32(cs, agx_1d(1), AGX_BARRIER_ALL, address, value);
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * zeroing all report values and writing their availability (or the
 * unavailable sentinel, for pools without availability words) as requested.
 */
static void
emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries,
                  bool set_available)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   for (uint32_t i = 0; i < num_queries; i++) {
      uint64_t report = hk_query_report_addr(dev, pool, first_index + i);

      uint64_t value = 0;
      if (hk_has_available(pool)) {
         uint64_t available = hk_query_available_addr(pool, first_index + i);
         hk_queue_write(cmd, available, set_available, false);
      } else {
         value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE;
      }

      /* XXX: is this supposed to happen on the begin? */
      for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) {
         hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)),
                        value, false);
         hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)) + 4,
                        value >> 32, false);
      }
   }
}

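/* CPU-side counterpart of emit_zero_queries, used where the reset happens on
 * the host (vkResetQueryPool) rather than on the GPU timeline.
 */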
static void
host_zero_queries(struct hk_device *dev, struct hk_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries,
                  bool set_available)
{
   for (uint32_t i = 0; i < num_queries; i++) {
      struct hk_query_report *reports =
         hk_query_report_map(dev, pool, first_index + i);

      uint64_t value = 0;
      if (hk_has_available(pool)) {
         uint32_t *available = hk_query_available_map(pool, first_index + i);
         *available = set_available;
      } else {
         value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE;
      }

      for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) {
         reports[j].value = value;
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery,
                  uint32_t queryCount)
{
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(hk_device, dev, device);

   host_zero_queries(dev, pool, firstQuery, queryCount, false);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
                     uint32_t firstQuery, uint32_t queryCount)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   hk_flush_if_timestamp(cmd, pool);

   perf_debug(dev, "Reset query pool");
   emit_zero_queries(cmd, pool, firstQuery, queryCount, false);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                      VkPipelineStageFlags2 stage, VkQueryPool queryPool,
                      uint32_t query)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   /* Workaround for DXVK on old kernels */
   if (!agx_supports_timestamps(&dev->dev))
      return;

   uint64_t report_addr = hk_query_report_addr(dev, pool, query);

   bool after_gfx = cmd->current_cs.gfx != NULL;

   /* When writing timestamps for compute, we split the control stream at each
    * write. This ensures we never need to copy compute timestamps, which would
    * require an extra control stream anyway. Unlike graphics, splitting compute
    * control streams is inexpensive so there's not a strong performance reason
    * to do otherwise. Finally, batching multiple timestamp writes (like we do
    * for graphics) would destroy the ability to profile individual compute
    * dispatches. While that's allowed by the Vulkan spec, it's pretty mean to
    * apps. So.. don't do that.
    */
   if (!after_gfx && cmd->current_cs.cs &&
       cmd->current_cs.cs->timestamp.end.addr) {

      perf_debug(dev, "Splitting for compute timestamp");
      hk_cmd_buffer_end_compute(cmd);
   }

   struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
      cmd, after_gfx ? &cmd->current_cs.gfx : &cmd->current_cs.cs, true);
   if (!cs)
      return;

   if (cs->timestamp.end.addr) {
      assert(after_gfx && "compute is handled above");

      struct hk_cs *after =
         hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.post_gfx, true);
      if (!after)
         return;

      libagx_copy_timestamp(after, agx_1d(1), AGX_BARRIER_ALL, report_addr,
                            cs->timestamp.end.addr);
   } else {
      cs->timestamp.end = (struct agx_timestamp_req){
         .addr = report_addr,
         .handle = pool->handle,
         .offset_B = hk_query_offset(pool, query),
      };
   }

   /* From the Vulkan spec:
    *
    *   "If vkCmdWriteTimestamp2 is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:"
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries, as described in the first behavior listed in the
    * Vulkan spec:
    *
    *   "The first query is a timestamp value and (if more than one bit is set
    *   in the view mask) zero is written to the remaining queries."
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true);
   }
}

static void
hk_cmd_begin_end_query(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
                       uint32_t query, uint32_t index,
                       VkQueryControlFlags flags, bool end)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   bool graphics = false;

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      assert(query < pool->oq_queries);

      if (end) {
         cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE;
      } else {
         cmd->state.gfx.occlusion.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT
                                            ? AGX_VISIBILITY_MODE_COUNTING
                                            : AGX_VISIBILITY_MODE_BOOLEAN;
      }

      uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
      cmd->state.gfx.occlusion.index = oq_index[query];
      cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION;
      graphics = true;
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      uint64_t addr = hk_query_report_addr(dev, pool, query);
      cmd->state.gfx.xfb_query[index] = end ? 0 : addr;
      break;
   }

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root;
      cmd->state.gfx.descriptors.root_dirty = true;

      root->draw.pipeline_stats = hk_query_report_addr(dev, pool, query);
      root->draw.pipeline_stats_flags = pool->vk.pipeline_statistics;

      /* XXX: I don't think this is correct... when does the query become
       * available exactly?
       */
      graphics = pool->vk.pipeline_statistics &
                 ~VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
      break;
   }

   default:
      unreachable("Unsupported query type");
   }

   /* We need to set available=1 after the graphics work finishes. */
   if (end) {
      perf_debug(dev, "Query ending, type %u", pool->vk.query_type);
      hk_queue_write(cmd, hk_query_available_addr(pool, query), 1, graphics);
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
                           uint32_t query, VkQueryControlFlags flags,
                           uint32_t index)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   hk_cmd_begin_end_query(cmd, pool, query, index, flags, false);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
                         uint32_t query, uint32_t index)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   hk_cmd_begin_end_query(cmd, pool, query, index, 0, true);

   /* From the Vulkan spec:
    *
    *   "If queries are used while executing a render pass instance that has
    *    multiview enabled, the query uses N consecutive query indices in
    *    the query pool (starting at query) where N is the number of bits set
    *    in the view mask in the subpass the query is used in. How the
    *    numerical results of the query are distributed among the queries is
    *    implementation-dependent."
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries.
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1) {
         perf_debug(dev, "Multiview query zeroing");
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true);
      }
   }
}

static bool
hk_query_is_available(struct hk_device *dev, struct hk_query_pool *pool,
                      uint32_t query)
{
   if (hk_has_available(pool)) {
      uint32_t *available = hk_query_available_map(pool, query);
      return p_atomic_read(available) != 0;
   } else {
      const struct hk_query_report *report =
         hk_query_report_map(dev, pool, query);

      return report->value != LIBAGX_QUERY_UNAVAILABLE;
   }
}

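/* Arbitrary 2 second timeout (in nanoseconds) for CPU-side waits on query
 * availability before declaring the device lost.
 */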
#define HK_QUERY_TIMEOUT 2000000000ull

static VkResult
hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool,
                            uint32_t query)
{
   uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT);

   while (os_time_get_nano() < abs_timeout_ns) {
      if (hk_query_is_available(dev, pool, query))
         return VK_SUCCESS;

      VkResult status = vk_device_check_status(&dev->vk);
      if (status != VK_SUCCESS)
         return status;
   }

   return vk_device_set_lost(&dev->vk, "query timeout");
}

static void
cpu_write_query_result(void *dst, uint32_t idx, VkQueryResultFlags flags,
                       uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst;
      dst64[idx] = result;
   } else {
      uint32_t *dst32 = dst;
      dst32[idx] = result;
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool,
                       uint32_t firstQuery, uint32_t queryCount,
                       size_t dataSize, void *pData, VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(hk_device, dev, device);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   if (vk_device_is_lost(&dev->vk))
      return VK_ERROR_DEVICE_LOST;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      const uint32_t query = firstQuery + i;

      bool available = hk_query_is_available(dev, pool, query);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = hk_query_wait_for_available(dev, pool, query);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      const struct hk_query_report *src = hk_query_report_map(dev, pool, query);
      assert(i * stride < dataSize);
      void *dst = (char *)pData + i * stride;

      uint32_t reports = hk_reports_per_query(pool);
      if (write_results) {
         for (uint32_t j = 0; j < reports; j++) {
            cpu_write_query_result(dst, j, flags, src[j].value);
         }
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(dst, reports, flags, available);
   }

   return status;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
                           uint32_t firstQuery, uint32_t queryCount,
                           VkBuffer dstBuffer, VkDeviceSize dstOffset,
                           VkDeviceSize stride, VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer);

   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   hk_flush_if_timestamp(cmd, pool);

   struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true);
   if (!cs)
      return;

   perf_debug(dev, "Query pool copy");
   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);

   struct libagx_copy_query_args info = {
      .availability = hk_has_available(pool) ? pool->bo->va->addr : 0,
      .results = pool->oq_queries ? dev->occlusion_queries.bo->va->addr
                                  : pool->bo->va->addr + pool->query_start,
      .oq_index = pool->oq_queries ? pool->bo->va->addr + pool->query_start : 0,

      .first_query = firstQuery,
      .dst_addr = hk_buffer_address(dst_buffer, dstOffset),
      .dst_stride = stride,
      .reports_per_query = hk_reports_per_query(pool),

      .partial = flags & VK_QUERY_RESULT_PARTIAL_BIT,
      ._64 = flags & VK_QUERY_RESULT_64_BIT,
      .with_availability = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT,
   };

   libagx_copy_query_struct(cs, agx_1d(queryCount), AGX_BARRIER_ALL, info);
}