/*
 * Copyright © 2024 Collabora Ltd.
 * SPDX-License-Identifier: MIT
 */
#include <stdint.h>
#include "util/os_time.h"

#include "vk_log.h"
#include "vk_synchronization.h"

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_meta.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_macros.h"
#include "panvk_query_pool.h"

/* At the API level, a query consists of a status and a result. Both are
 * uninitialized initially. There are these query operations:
 *
 * - Reset op sets the status to unavailable and leaves the result undefined.
 * - Begin/End pair or Write op sets the status to available and the result
 *   to the final query value. Because of VK_QUERY_RESULT_PARTIAL_BIT, the
 *   result must hold valid intermediate query values while the query is
 *   active.
 * - Copy op copies the result and optionally the status to a buffer.
 *
 * All query operations define execution dependencies among themselves when
 * they reference the same queries. The only exception is the Copy op when
 * VK_QUERY_RESULT_WAIT_BIT is not set.
 *
 * We use a panvk_cs_sync32 to store the status of a query:
 *
 * - Reset op waits on all prior query operations affecting the query before
 *   setting the seqno to 0 synchronously.
 * - Begin op does not access the seqno.
 * - End or Write op sets the seqno to 1 asynchronously.
 * - Copy op waits on the seqno only when VK_QUERY_RESULT_WAIT_BIT is set.
 *
 * Because Reset op acts as a full barrier, End or Write op knows the seqno is
 * 0 and does not need to wait.
 */

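/* Rough sketch of the per-query storage this file relies on. The
 * authoritative definitions live in panvk_query_pool.h and the panvk CS
 * headers; the syncobj layout below is only an assumption for illustration:
 *
 *    struct panvk_cs_sync32 {
 *       uint32_t seqno;   // 0 = unavailable, 1 = available
 *       uint32_t error;
 *    };
 *
 *    panvk_query_available_dev_addr(pool, q): device address of query q's
 *                                             availability syncobj
 *    panvk_query_report_dev_addr(pool, q):    device address of query q's
 *                                             64-bit occlusion report
 *
 * Each occlusion query thus owns one 64-bit report (the accumulated sample
 * count) and one small syncobj, which is why reset_oq_batch() below works in
 * units of two 32-bit registers per query.
 */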
static void
reset_oq_batch(struct cs_builder *b, struct cs_index addr,
               struct cs_index zero_regs, uint32_t query_count)
{
   const uint32_t regs_per_query = 2;
   const uint32_t queries_per_batch = zero_regs.size / regs_per_query;
   uint32_t remaining_queries = query_count;

   assert(zero_regs.size > 2 && ALIGN_POT(zero_regs.size, 2) == zero_regs.size);

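   /* For large query counts, run a CS loop driven by a counter register,
    * stealing the topmost zero register for the counter and storing from the
    * remaining even-sized register tuple. Small counts are unrolled below. */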
   if (query_count > queries_per_batch * 4) {
      struct cs_index counter = cs_reg32(b, zero_regs.reg + zero_regs.size - 1);
      struct cs_index new_zero_regs =
         cs_reg_tuple(b, zero_regs.reg, zero_regs.size - 2);
      const uint32_t adjusted_queries_per_batch =
         new_zero_regs.size / regs_per_query;
      uint32_t full_batches = query_count / adjusted_queries_per_batch;

      cs_move32_to(b, counter, full_batches);
      cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
         cs_store(b, new_zero_regs, addr, BITFIELD_MASK(new_zero_regs.size), 0);
         cs_add64(b, addr, addr, new_zero_regs.size * sizeof(uint32_t));
         cs_add32(b, counter, counter, -1);
      }

      remaining_queries =
         query_count - (full_batches * adjusted_queries_per_batch);
   }

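   /* Unrolled tail: each query occupies regs_per_query 32-bit registers, so a
    * batch of N queries is a single N * regs_per_query register store. */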
   for (uint32_t i = 0; i < remaining_queries; i += queries_per_batch) {
      struct cs_index new_zero_regs = cs_reg_tuple(
         b, zero_regs.reg,
         MIN2(remaining_queries - i, queries_per_batch) * regs_per_query);

      cs_store(b, new_zero_regs, addr, BITFIELD_MASK(new_zero_regs.size),
               i * regs_per_query * sizeof(uint32_t));
   }
}

static void
panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd,
                                  struct panvk_query_pool *pool,
                                  uint32_t first_query, uint32_t query_count)
{
   struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT);

   /* Wait on the deferred sync to ensure all prior query operations have
    * completed. */
   cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false);

   struct cs_index addr = cs_scratch_reg64(b, 16);
   struct cs_index zero_regs = cs_scratch_reg_tuple(b, 0, 16);

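   /* Preload the scratch registers with zeroes so reset_oq_batch() can store
    * them in bulk. */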
   for (uint32_t i = 0; i < zero_regs.size; i += 2)
      cs_move64_to(b, cs_scratch_reg64(b, i), 0);

   /* Zero all query syncobjs so they report non-available. We don't use
    * cs_sync32_set() because no-one is waiting on these syncobjs with
    * cs_sync32_wait(). The only reason we use a syncobj is so we can
    * defer the signalling in the issue_fragment_jobs() path. */
   cs_move64_to(b, addr, panvk_query_available_dev_addr(pool, first_query));
   reset_oq_batch(b, addr, zero_regs, query_count);

   cs_move64_to(b, addr, panvk_query_report_dev_addr(pool, first_query));
   reset_oq_batch(b, addr, zero_regs, query_count);

   /* reset_oq_batch() only emits the stores; we need to flush them explicitly
    * here. */
   cs_wait_slot(b, SB_ID(LS), false);

   /* Flush the caches to make the new values visible to the CPU. */
   struct cs_index flush_id = cs_scratch_reg32(b, 0);

   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, false,
                   flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
   cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
}

static void
panvk_cmd_begin_occlusion_query(struct panvk_cmd_buffer *cmd,
                                struct panvk_query_pool *pool, uint32_t query,
                                VkQueryControlFlags flags)
{
   uint64_t report_addr = panvk_query_report_dev_addr(pool, query);

   cmd->state.gfx.occlusion_query.ptr = report_addr;
   cmd->state.gfx.occlusion_query.syncobj =
      panvk_query_available_dev_addr(pool, query);
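   /* Precise queries need the counter mode (exact sample counts); otherwise
    * the cheaper predicate mode, which only tells whether any sample passed,
    * is sufficient for the Vulkan requirements. */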
   cmd->state.gfx.occlusion_query.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT
                                            ? MALI_OCCLUSION_MODE_COUNTER
                                            : MALI_OCCLUSION_MODE_PREDICATE;
   gfx_state_set_dirty(cmd, OQ);

   /* From the Vulkan spec:
    *
    *    "When an occlusion query begins, the count of passing samples
    *     always starts at zero."
    */
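   /* Clearing the report here also keeps VK_QUERY_RESULT_PARTIAL_BIT copies
    * meaningful while the query is active: the report always holds a valid
    * intermediate sample count. */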
   struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT);

   struct cs_index report_addr_gpu = cs_scratch_reg64(b, 0);
   struct cs_index clear_value = cs_scratch_reg64(b, 2);
   cs_move64_to(b, report_addr_gpu, report_addr);
   cs_move64_to(b, clear_value, 0);
   cs_store64(b, clear_value, report_addr_gpu, 0);
   cs_wait_slot(b, SB_ID(LS), false);
}

static void
panvk_cmd_end_occlusion_query(struct panvk_cmd_buffer *cmd,
                              struct panvk_query_pool *pool, uint32_t query)
{
   uint64_t syncobj_addr = panvk_query_available_dev_addr(pool, query);

   cmd->state.gfx.occlusion_query.ptr = 0;
   cmd->state.gfx.occlusion_query.syncobj = 0;
   cmd->state.gfx.occlusion_query.mode = MALI_OCCLUSION_MODE_DISABLED;
   gfx_state_set_dirty(cmd, OQ);

   /* If the render pass is active, we let EndRendering take care of the
    * occlusion query end when the fragment job is issued. */
   if (cmd->state.gfx.render.oq.last == syncobj_addr)
      return;

   struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT);
   struct cs_index oq_syncobj = cs_scratch_reg64(b, 0);
   struct cs_index val = cs_scratch_reg32(b, 2);

   /* The OQ accumulates sample counts into the report, which lives in cached
    * memory. Wait for the accumulation and flush the caches. */
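   /* The flush is deferred until all iteration scoreboard slots have
    * signalled (i.e. until the fragment jobs feeding this query are done),
    * and the syncobj update below is in turn deferred until the flush has
    * landed. */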
   cs_move32_to(b, val, 0);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, false,
                   val, cs_defer(SB_ALL_ITERS_MASK, SB_ID(DEFERRED_FLUSH)));

   /* Signal the query syncobj after the flush is effective. */
   cs_move32_to(b, val, 1);
   cs_move64_to(b, oq_syncobj, panvk_query_available_dev_addr(pool, query));
   cs_sync32_set(b, true, MALI_CS_SYNC_SCOPE_CSG, val, oq_syncobj,
                 cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC)));
}

static void
copy_oq_result_batch(struct cs_builder *b,
                     VkQueryResultFlags flags,
                     struct cs_index dst_addr,
                     VkDeviceSize dst_stride,
                     struct cs_index res_addr,
                     struct cs_index avail_addr,
                     struct cs_index scratch_regs,
                     uint32_t query_count)
{
   uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1;
   uint32_t regs_per_copy =
      res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0);

   assert(query_count <= scratch_regs.size / regs_per_copy);

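   /* Each query uses a contiguous run of regs_per_copy scratch registers: the
    * result in the first res_size registers, optionally followed by the
    * availability value. A single store per query then writes the result and
    * availability in the layout expected by the destination buffer. */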
   for (uint32_t i = 0; i < query_count; i++) {
      struct cs_index res =
         cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), res_size);
      struct cs_index avail = cs_reg32(b, res.reg + res_size);

      cs_load_to(b, res, res_addr, BITFIELD_MASK(res.size),
                 i * sizeof(uint64_t));

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cs_load32_to(b, avail, avail_addr, i * sizeof(struct panvk_cs_sync32));
   }

   /* Flush the loads. */
   cs_wait_slot(b, SB_ID(LS), false);

   for (uint32_t i = 0; i < query_count; i++) {
      struct cs_index store_src =
         cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), regs_per_copy);

      cs_store(b, store_src, dst_addr, BITFIELD_MASK(regs_per_copy),
               i * dst_stride);
   }

   /* Flush the stores. */
   cs_wait_slot(b, SB_ID(LS), false);
}

static void
panvk_copy_occlusion_query_results(struct panvk_cmd_buffer *cmd,
                                   struct panvk_query_pool *pool,
                                   uint32_t first_query, uint32_t query_count,
                                   uint64_t dst_buffer_addr,
                                   VkDeviceSize stride,
                                   VkQueryResultFlags flags)
{
   struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT);

   /* Wait for occlusion query syncobjs to be signalled. */
   if (flags & VK_QUERY_RESULT_WAIT_BIT)
      cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false);

   uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1;
   uint32_t regs_per_copy =
      res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0);

   struct cs_index dst_addr = cs_scratch_reg64(b, 16);
   struct cs_index res_addr = cs_scratch_reg64(b, 14);
   struct cs_index avail_addr = cs_scratch_reg64(b, 12);
   struct cs_index counter = cs_scratch_reg32(b, 11);
   struct cs_index scratch_regs = cs_scratch_reg_tuple(b, 0, 11);
   uint32_t queries_per_batch = scratch_regs.size / regs_per_copy;

   /* The store offset is a 16-bit signed integer, so we might be limited by
    * the stride here. */
   queries_per_batch = MIN2(((1u << 15) / stride) + 1, queries_per_batch);

   /* Use a CS loop rather than unrolling when the copy would take more than
    * two batches. */
   if (query_count > 2 * queries_per_batch) {
      uint32_t copied_query_count =
         query_count - (query_count % queries_per_batch);

      cs_move32_to(b, counter, copied_query_count);
      cs_move64_to(b, dst_addr, dst_buffer_addr);
      cs_move64_to(b, res_addr, panvk_query_report_dev_addr(pool, first_query));
      cs_move64_to(b, avail_addr,
                   panvk_query_available_dev_addr(pool, first_query));
      cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
         copy_oq_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr,
                              scratch_regs, queries_per_batch);

         cs_add32(b, counter, counter, -queries_per_batch);
         cs_add64(b, dst_addr, dst_addr, queries_per_batch * stride);
         cs_add64(b, res_addr, res_addr, queries_per_batch * sizeof(uint64_t));
         cs_add64(b, avail_addr, avail_addr,
                  queries_per_batch * sizeof(uint64_t));
      }

      dst_buffer_addr += stride * copied_query_count;
      first_query += copied_query_count;
      query_count -= copied_query_count;
   }

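   /* Copy whatever is left (at most two batches' worth) with unrolled,
    * immediate-address batches. */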
   for (uint32_t i = 0; i < query_count; i += queries_per_batch) {
      cs_move64_to(b, dst_addr, dst_buffer_addr + (i * stride));
      cs_move64_to(b, res_addr,
                   panvk_query_report_dev_addr(pool, i + first_query));
      cs_move64_to(b, avail_addr,
                   panvk_query_available_dev_addr(pool, i + first_query));
      copy_oq_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr,
                           scratch_regs,
                           MIN2(queries_per_batch, query_count - i));
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdResetQueryPool)(VkCommandBuffer commandBuffer,
                                  VkQueryPool queryPool, uint32_t firstQuery,
                                  uint32_t queryCount)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);

   if (queryCount == 0)
      return;

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      panvk_cmd_reset_occlusion_queries(cmd, pool, firstQuery, queryCount);
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdBeginQueryIndexedEXT)(VkCommandBuffer commandBuffer,
                                        VkQueryPool queryPool, uint32_t query,
                                        VkQueryControlFlags flags,
                                        uint32_t index)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);

   /* TODO: transform feedback */
   assert(index == 0);

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      panvk_cmd_begin_occlusion_query(cmd, pool, query, flags);
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdEndQueryIndexedEXT)(VkCommandBuffer commandBuffer,
                                      VkQueryPool queryPool, uint32_t query,
                                      uint32_t index)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);

   /* TODO: transform feedback */
   assert(index == 0);

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      panvk_cmd_end_occlusion_query(cmd, pool, query);
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdWriteTimestamp2)(VkCommandBuffer commandBuffer,
                                   VkPipelineStageFlags2 stage,
                                   VkQueryPool queryPool, uint32_t query)
{
   UNUSED VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   UNUSED VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);

   panvk_stub();
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdCopyQueryPoolResults)(
   VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery,
   uint32_t queryCount, VkBuffer dstBuffer, VkDeviceSize dstOffset,
   VkDeviceSize stride, VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(panvk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(panvk_buffer, dst_buffer, dstBuffer);

   uint64_t dst_buffer_addr = panvk_buffer_gpu_ptr(dst_buffer, dstOffset);

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      panvk_copy_occlusion_query_results(cmd, pool, firstQuery, queryCount,
                                         dst_buffer_addr, stride, flags);
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}