/*
 * Copyright © 2024 Collabora Ltd.
 * SPDX-License-Identifier: MIT
 */
#include <stdint.h>
#include "util/os_time.h"

#include "vk_log.h"
#include "vk_synchronization.h"

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_meta.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_macros.h"
#include "panvk_query_pool.h"

/* At the API level, a query consists of a status and a result.  Both are
 * uninitialized initially.  There are these query operations:
 *
 *  - Reset op sets the status to unavailable and leaves the result undefined.
 *  - Begin/End pair or Write op sets the status to available and the result
 *    to the final query value.  Because of VK_QUERY_RESULT_PARTIAL_BIT, the
 *    result must hold valid intermediate query values while the query is
 *    active.
 *  - Copy op copies the result and optionally the status to a buffer.
 *
 * All query operations define execution dependencies among themselves when
 * they reference the same queries.  The only exception is the Copy op when
 * VK_QUERY_RESULT_WAIT_BIT is not set.
 *
 * We use a panvk_cs_sync32 to store the status of a query:
 *
 *  - Reset op waits on all prior query operations affecting the query before
 *    setting the seqno to 0 synchronously.
 *  - Begin op does not access the seqno.
 *  - End or Write op sets the seqno to 1 asynchronously.
 *  - Copy op waits on the seqno only when VK_QUERY_RESULT_WAIT_BIT is set.
 *
 * Because Reset op acts as a full barrier, End or Write op knows the seqno is
 * 0 and does not need to wait.
 */
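
/* Storage layout, as implied by the address helpers used below: each query
 * owns a 64-bit result slot at panvk_query_report_dev_addr() and a
 * struct panvk_cs_sync32 status object at panvk_query_available_dev_addr().
 * The copy path advances the availability pointer by sizeof(uint64_t) per
 * query, which assumes the sync32 object matches that stride. */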

static void
reset_oq_batch(struct cs_builder *b, struct cs_index addr,
               struct cs_index zero_regs, uint32_t query_count)
{
   const uint32_t regs_per_query = 2;
   const uint32_t queries_per_batch = zero_regs.size / regs_per_query;
   uint32_t remaining_queries = query_count;

   assert(zero_regs.size > 2 && ALIGN_POT(zero_regs.size, 2) == zero_regs.size);

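   /* For large resets, emit a CS loop instead of fully unrolling: the last
    * zero register is repurposed as the loop counter, an even-sized subset of
    * the remaining zero registers is stored per iteration, and the
    * destination pointer is advanced as we go.  Whatever doesn't fit in a
    * full batch falls through to the unrolled stores below. */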
   if (query_count > queries_per_batch * 4) {
      struct cs_index counter = cs_reg32(b, zero_regs.reg + zero_regs.size - 1);
      struct cs_index new_zero_regs =
         cs_reg_tuple(b, zero_regs.reg, zero_regs.size - 2);
      const uint32_t adjusted_queries_per_batch =
         new_zero_regs.size / regs_per_query;
      uint32_t full_batches = query_count / adjusted_queries_per_batch;

      cs_move32_to(b, counter, full_batches);
      cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
         cs_store(b, new_zero_regs, addr, BITFIELD_MASK(new_zero_regs.size), 0);
         cs_add64(b, addr, addr, new_zero_regs.size * sizeof(uint32_t));
         cs_add32(b, counter, counter, -1);
      }

      remaining_queries =
         query_count - (full_batches * adjusted_queries_per_batch);
   }

   for (uint32_t i = 0; i < remaining_queries; i += queries_per_batch) {
      struct cs_index new_zero_regs = cs_reg_tuple(
         b, zero_regs.reg,
         MIN2(remaining_queries - i, queries_per_batch) * regs_per_query);

      cs_store(b, new_zero_regs, addr, BITFIELD_MASK(new_zero_regs.size),
               i * regs_per_query * sizeof(uint32_t));
   }
}

static void
panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd,
                                  struct panvk_query_pool *pool,
                                  uint32_t first_query, uint32_t query_count)
{
   struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT);

   /* Wait on deferred sync to ensure all prior query operations have
    * completed
    */
   cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false);

   struct cs_index addr = cs_scratch_reg64(b, 16);
   struct cs_index zero_regs = cs_scratch_reg_tuple(b, 0, 16);

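   /* Fill the scratch window with zeroes; reset_oq_batch() uses it as the
    * store source for both the availability and report arrays. */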
   for (uint32_t i = 0; i < zero_regs.size; i += 2)
      cs_move64_to(b, cs_scratch_reg64(b, i), 0);

   /* Zero out the query syncobjs so they report non-available. We don't use
    * cs_sync32_set() because no one waits on these syncobjs with
    * cs_sync32_wait(). The only reason we use a syncobj is so we can defer
    * the signalling in the issue_fragment_jobs() path. */
   cs_move64_to(b, addr, panvk_query_available_dev_addr(pool, first_query));
   reset_oq_batch(b, addr, zero_regs, query_count);

   cs_move64_to(b, addr, panvk_query_report_dev_addr(pool, first_query));
   reset_oq_batch(b, addr, zero_regs, query_count);

   /* reset_oq_batch() only emits the stores; we need to flush them explicitly
    * here. */
   cs_wait_slot(b, SB_ID(LS), false);

   /* Flush the caches to make the new values visible to the CPU. */
   struct cs_index flush_id = cs_scratch_reg32(b, 0);

   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, false,
                   flush_id,
                   cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
   cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
}

static void
panvk_cmd_begin_occlusion_query(struct panvk_cmd_buffer *cmd,
                                struct panvk_query_pool *pool, uint32_t query,
                                VkQueryControlFlags flags)
{
   uint64_t report_addr = panvk_query_report_dev_addr(pool, query);

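   /* Record the report address, availability syncobj and counting mode; the
    * draw path consumes these when it emits state for subsequent draws. */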
   cmd->state.gfx.occlusion_query.ptr = report_addr;
   cmd->state.gfx.occlusion_query.syncobj =
      panvk_query_available_dev_addr(pool, query);
   cmd->state.gfx.occlusion_query.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT
                                            ? MALI_OCCLUSION_MODE_COUNTER
                                            : MALI_OCCLUSION_MODE_PREDICATE;
   gfx_state_set_dirty(cmd, OQ);

   /* From the Vulkan spec:
    *
    *   "When an occlusion query begins, the count of passing samples
    *    always starts at zero."
    */
   struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT);

   struct cs_index report_addr_gpu = cs_scratch_reg64(b, 0);
   struct cs_index clear_value = cs_scratch_reg64(b, 2);
   cs_move64_to(b, report_addr_gpu, report_addr);
   cs_move64_to(b, clear_value, 0);
   cs_store64(b, clear_value, report_addr_gpu, 0);
   cs_wait_slot(b, SB_ID(LS), false);
}

static void
panvk_cmd_end_occlusion_query(struct panvk_cmd_buffer *cmd,
                              struct panvk_query_pool *pool, uint32_t query)
{
   uint64_t syncobj_addr = panvk_query_available_dev_addr(pool, query);

   cmd->state.gfx.occlusion_query.ptr = 0;
   cmd->state.gfx.occlusion_query.syncobj = 0;
   cmd->state.gfx.occlusion_query.mode = MALI_OCCLUSION_MODE_DISABLED;
   gfx_state_set_dirty(cmd, OQ);

   /* If the render pass is active, we let EndRendering take care of the
    * occlusion query end when the fragment job is issued. */
   if (cmd->state.gfx.render.oq.last == syncobj_addr)
      return;

   struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT);
   struct cs_index oq_syncobj = cs_scratch_reg64(b, 0);
   struct cs_index val = cs_scratch_reg32(b, 2);

   /* OQ accumulates sample counts into the report, which lives in cached
    * memory.  Wait for the accumulation and flush the caches.
    */
   cs_move32_to(b, val, 0);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, false,
                   val, cs_defer(SB_ALL_ITERS_MASK, SB_ID(DEFERRED_FLUSH)));

   /* Signal the query syncobj after the flush is effective. */
   cs_move32_to(b, val, 1);
   cs_move64_to(b, oq_syncobj, panvk_query_available_dev_addr(pool, query));
   cs_sync32_set(b, true, MALI_CS_SYNC_SCOPE_CSG, val, oq_syncobj,
                 cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC)));
}

static void
copy_oq_result_batch(struct cs_builder *b,
                     VkQueryResultFlags flags,
                     struct cs_index dst_addr,
                     VkDeviceSize dst_stride,
                     struct cs_index res_addr,
                     struct cs_index avail_addr,
                     struct cs_index scratch_regs,
                     uint32_t query_count)
{
   uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1;
   uint32_t regs_per_copy =
      res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0);

   assert(query_count <= scratch_regs.size / regs_per_copy);

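   /* Each query uses regs_per_copy consecutive scratch registers: the result
    * occupies the low res_size registers, optionally followed by the
    * availability word.  This matches the destination layout required by
    * vkCmdCopyQueryPoolResults (result first, then availability), so each
    * query can be written back with a single contiguous store. */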
   for (uint32_t i = 0; i < query_count; i++) {
      struct cs_index res =
         cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), res_size);
      struct cs_index avail = cs_reg32(b, res.reg + res_size);

      cs_load_to(b, res, res_addr, BITFIELD_MASK(res.size),
                 i * sizeof(uint64_t));

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cs_load32_to(b, avail, avail_addr, i * sizeof(struct panvk_cs_sync32));
   }

   /* Flush the loads. */
   cs_wait_slot(b, SB_ID(LS), false);

   for (uint32_t i = 0; i < query_count; i++) {
      struct cs_index store_src =
         cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), regs_per_copy);

      cs_store(b, store_src, dst_addr, BITFIELD_MASK(regs_per_copy),
               i * dst_stride);
   }

   /* Flush the stores. */
   cs_wait_slot(b, SB_ID(LS), false);
}

static void
panvk_copy_occlusion_query_results(struct panvk_cmd_buffer *cmd,
                                   struct panvk_query_pool *pool,
                                   uint32_t first_query, uint32_t query_count,
                                   uint64_t dst_buffer_addr,
                                   VkDeviceSize stride,
                                   VkQueryResultFlags flags)
{
   struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT);

   /* Wait for occlusion query syncobjs to be signalled. */
   if (flags & VK_QUERY_RESULT_WAIT_BIT)
      cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false);

   uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1;
   uint32_t regs_per_copy =
      res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0);

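   /* Scratch register budget: registers 0-10 hold the per-batch copy payload,
    * register 11 is the loop counter, and registers 12-13, 14-15 and 16-17
    * hold the availability, report and destination addresses respectively. */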
   struct cs_index dst_addr = cs_scratch_reg64(b, 16);
   struct cs_index res_addr = cs_scratch_reg64(b, 14);
   struct cs_index avail_addr = cs_scratch_reg64(b, 12);
   struct cs_index counter = cs_scratch_reg32(b, 11);
   struct cs_index scratch_regs = cs_scratch_reg_tuple(b, 0, 11);
   uint32_t queries_per_batch = scratch_regs.size / regs_per_copy;

   /* Store offset is a 16-bit signed integer, so we might be limited by the
    * stride here. */
   queries_per_batch = MIN2(((1u << 15) / stride) + 1, queries_per_batch);

   /* Stop unrolling the loop when it takes more than 2 steps to copy the
    * queries. */
   if (query_count > 2 * queries_per_batch) {
      uint32_t copied_query_count =
         query_count - (query_count % queries_per_batch);

      cs_move32_to(b, counter, copied_query_count);
      cs_move64_to(b, dst_addr, dst_buffer_addr);
      cs_move64_to(b, res_addr, panvk_query_report_dev_addr(pool, first_query));
      cs_move64_to(b, avail_addr,
                   panvk_query_available_dev_addr(pool, first_query));
      cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
         copy_oq_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr,
                              scratch_regs, queries_per_batch);

         cs_add32(b, counter, counter, -queries_per_batch);
         cs_add64(b, dst_addr, dst_addr, queries_per_batch * stride);
         cs_add64(b, res_addr, res_addr, queries_per_batch * sizeof(uint64_t));
         cs_add64(b, avail_addr, avail_addr,
                  queries_per_batch * sizeof(uint64_t));
      }

      dst_buffer_addr += stride * copied_query_count;
      first_query += copied_query_count;
      query_count -= copied_query_count;
   }

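   /* Copy whatever is left with unrolled batches; the source and destination
    * addresses are reloaded as immediates for each batch instead of being
    * advanced in registers. */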
   for (uint32_t i = 0; i < query_count; i += queries_per_batch) {
      cs_move64_to(b, dst_addr, dst_buffer_addr + (i * stride));
      cs_move64_to(b, res_addr,
                   panvk_query_report_dev_addr(pool, i + first_query));
      cs_move64_to(b, avail_addr,
                   panvk_query_available_dev_addr(pool, i + first_query));
      copy_oq_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr,
                           scratch_regs,
                           MIN2(queries_per_batch, query_count - i));
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdResetQueryPool)(VkCommandBuffer commandBuffer,
                                  VkQueryPool queryPool, uint32_t firstQuery,
                                  uint32_t queryCount)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);

   if (queryCount == 0)
      return;

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      panvk_cmd_reset_occlusion_queries(cmd, pool, firstQuery, queryCount);
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdBeginQueryIndexedEXT)(VkCommandBuffer commandBuffer,
                                        VkQueryPool queryPool, uint32_t query,
                                        VkQueryControlFlags flags,
                                        uint32_t index)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);

   /* TODO: transform feedback */
   assert(index == 0);

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      panvk_cmd_begin_occlusion_query(cmd, pool, query, flags);
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdEndQueryIndexedEXT)(VkCommandBuffer commandBuffer,
                                      VkQueryPool queryPool, uint32_t query,
                                      uint32_t index)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);

   /* TODO: transform feedback */
   assert(index == 0);

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      panvk_cmd_end_occlusion_query(cmd, pool, query);
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdWriteTimestamp2)(VkCommandBuffer commandBuffer,
                                   VkPipelineStageFlags2 stage,
                                   VkQueryPool queryPool, uint32_t query)
{
   UNUSED VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   UNUSED VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);

   panvk_stub();
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdCopyQueryPoolResults)(
   VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
   uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset,
   VkDeviceSize stride, VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(panvk_buffer, dst_buffer, dstBuffer);

   uint64_t dst_buffer_addr = panvk_buffer_gpu_ptr(dst_buffer, dstOffset);

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      panvk_copy_occlusion_query_results(cmd, pool, firstQuery, queryCount,
                                         dst_buffer_addr, stride, flags);
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}
400