/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_query_pool.h"

#include "nvk_buffer.h"
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_event.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"

#include "vk_meta.h"
#include "vk_pipeline.h"

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"

#include "nouveau_bo.h"
#include "nouveau_context.h"

#include "util/os_time.h"

#include "nvk_cl906f.h"
#include "nvk_cl9097.h"
#include "nvk_cla0c0.h"
#include "nvk_clc597.h"

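/* Layout of a single query report as written by the GPU.  A
 * STRUCTURE_SIZE_FOUR_WORDS report semaphore write is 16 bytes; the counter
 * payload lands in the first 8 bytes and the GPU timestamp in the last 8,
 * which is what this struct mirrors.
 */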
struct nvk_query_report {
   uint64_t value;
   uint64_t timestamp;
};

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateQueryPool(VkDevice device,
                    const VkQueryPoolCreateInfo *pCreateInfo,
                    const VkAllocationCallbacks *pAllocator,
                    VkQueryPool *pQueryPool)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   struct nvk_query_pool *pool;

   pool = vk_query_pool_create(&dev->vk, pCreateInfo,
                               pAllocator, sizeof(*pool));
   if (!pool)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* We place the availability data first, followed by the query reports */
   pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
                             sizeof(struct nvk_query_report));

   uint32_t reports_per_query;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      reports_per_query = 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      reports_per_query = 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      reports_per_query = 2 * util_bitcount(pool->vk.pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* 2 for primitives succeeded, 2 for primitives needed */
      reports_per_query = 4;
      break;
   default:
      unreachable("Unsupported query type");
   }
   pool->query_stride = reports_per_query * sizeof(struct nvk_query_report);
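   /* For example, an occlusion query uses a begin and an end report, so its
    * stride is 2 * sizeof(struct nvk_query_report) = 32 bytes.
    */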

   if (pool->vk.query_count > 0) {
      uint32_t bo_size = pool->query_start +
                         pool->query_stride * pool->vk.query_count;
      pool->bo = nouveau_ws_bo_new_mapped(dev->ws_dev, bo_size, 0,
                                          NOUVEAU_WS_BO_GART |
                                          NOUVEAU_WS_BO_NO_SHARE,
                                          NOUVEAU_WS_BO_RDWR,
                                          &pool->bo_map);
      if (!pool->bo) {
         vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
         return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      }

      if (dev->ws_dev->debug_flags & NVK_DEBUG_ZERO_MEMORY)
         memset(pool->bo_map, 0, bo_size);
   }

   *pQueryPool = nvk_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyQueryPool(VkDevice device,
                     VkQueryPool queryPool,
                     const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   if (!pool)
      return;

   if (pool->bo) {
      nouveau_ws_bo_unmap(pool->bo, pool->bo_map);
      nouveau_ws_bo_destroy(pool->bo);
   }
   vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
}

static uint64_t
nvk_query_available_addr(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->bo->offset + query * sizeof(uint32_t);
}

static nir_def *
nvk_nir_available_addr(nir_builder *b, nir_def *pool_addr,
                       nir_def *query)
{
   nir_def *offset = nir_imul_imm(b, query, sizeof(uint32_t));
   return nir_iadd(b, pool_addr, nir_u2u64(b, offset));
}

static uint32_t *
nvk_query_available_map(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return (uint32_t *)pool->bo_map + query;
}

static uint64_t
nvk_query_offset(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->query_start + query * pool->query_stride;
}

static uint64_t
nvk_query_report_addr(struct nvk_query_pool *pool, uint32_t query)
{
   return pool->bo->offset + nvk_query_offset(pool, query);
}

static nir_def *
nvk_nir_query_report_addr(nir_builder *b, nir_def *pool_addr,
                          nir_def *query_start, nir_def *query_stride,
                          nir_def *query)
{
   nir_def *offset =
      nir_iadd(b, query_start, nir_umul_2x32_64(b, query, query_stride));
   return nir_iadd(b, pool_addr, offset);
}

static struct nvk_query_report *
nvk_query_report_map(struct nvk_query_pool *pool, uint32_t query)
{
   return (void *)((char *)pool->bo_map + nvk_query_offset(pool, query));
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * setting their values to 0 and marking them as available.
 */
static void
emit_zero_queries(struct nvk_cmd_buffer *cmd, struct nvk_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
      for (uint32_t i = 0; i < num_queries; i++) {
         uint64_t addr = nvk_query_available_addr(pool, first_index + i);

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
         P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
         P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
         P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
         P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
         P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
            .operation = OPERATION_RELEASE,
            .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
            .pipeline_location = PIPELINE_LOCATION_ALL,
            .structure_size = STRUCTURE_SIZE_ONE_WORD,
         });
      }
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_ResetQueryPool(VkDevice device,
                   VkQueryPool queryPool,
                   uint32_t firstQuery,
                   uint32_t queryCount)
{
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   uint32_t *available = nvk_query_available_map(pool, firstQuery);
   memset(available, 0, queryCount * sizeof(*available));
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                      VkQueryPool queryPool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_RELEASE,
         .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }

   /* Wait for the above writes to complete. This prevents WaW hazards on any
    * later query availability updates and ensures vkCmdCopyQueryPoolResults
    * will see the query as unavailable if it happens before the query is
    * completed again.
    */
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
      P_NV906F_SEMAPHOREA(p, addr >> 32);
      P_NV906F_SEMAPHOREB(p, (addr & UINT32_MAX) >> 2);
      P_NV906F_SEMAPHOREC(p, 0);
      P_NV906F_SEMAPHORED(p, {
         .operation = OPERATION_ACQUIRE,
         .acquire_switch = ACQUIRE_SWITCH_ENABLED,
         .release_size = RELEASE_SIZE_4BYTE,
      });
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                       VkPipelineStageFlags2 stage,
                       VkQueryPool queryPool,
                       uint32_t query)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);

   uint64_t report_addr = nvk_query_report_addr(pool, query);
   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
   P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
   P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
      .operation = OPERATION_REPORT_ONLY,
      .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
      .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
   });

   uint64_t available_addr = nvk_query_available_addr(pool, query);
   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
   P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
   P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
      .operation = OPERATION_RELEASE,
      .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
      .pipeline_location = PIPELINE_LOCATION_ALL,
      .structure_size = STRUCTURE_SIZE_ONE_WORD,
   });

   /* From the Vulkan spec:
    *
    * "If vkCmdWriteTimestamp2 is called while executing a render pass
    * instance that has multiview enabled, the timestamp uses N consecutive
    * query indices in the query pool (starting at query) where N is the
    * number of bits set in the view mask of the subpass the command is
    * executed in. The resulting query values are determined by an
    * implementation-dependent choice of one of the following behaviors:"
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries, as described in the first behavior listed in the
    * Vulkan spec:
    *
    * "The first query is a timestamp value and (if more than one bit is set
    * in the view mask) zero is written to the remaining queries."
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1);
   }
}

struct nvk_3d_stat_query {
   VkQueryPipelineStatisticFlagBits flag;
   uint8_t loc;
   uint8_t report;
};

/* This must remain sorted in flag order */
static const struct nvk_3d_stat_query nvk_3d_stat_queries[] = {{
   .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT,
   .loc = UINT8_MAX,
   .report = UINT8_MAX,
}};
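
/* The compute shader invocation count has no 3D-class report (loc/report are
 * UINT8_MAX above); it is written via the NVK_MME_WRITE_CS_INVOCATIONS macro
 * instead.
 */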

static void
mme_store_global(struct mme_builder *b,
                 struct mme_value64 addr,
                 struct mme_value v)
{
   mme_mthd(b, NV9097_SET_REPORT_SEMAPHORE_A);
   mme_emit_addr64(b, addr);
   mme_emit(b, v);
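   /* The raw 0x10000000 dword below is the SET_REPORT_SEMAPHORE_D value; it
    * presumably selects a one-word (payload-only) release, so only the value
    * above is written and no timestamp.
    */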
   mme_emit(b, mme_imm(0x10000000));
}

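/* Writes the accumulated compute shader invocation count (kept in the
 * NVK_MME_SCRATCH_CS_INVOCATIONS_HI/LO shadow scratch registers, which the
 * dispatch path is assumed to maintain) to the 64-bit address passed as the
 * macro parameter.
 */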
void
nvk_mme_write_cs_invocations(struct mme_builder *b)
{
   struct mme_value64 dst_addr = mme_load_addr64(b);

   struct mme_value accum_hi = mme_state(b,
      NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI));
   struct mme_value accum_lo = mme_state(b,
      NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO));
   struct mme_value64 accum = mme_value64(accum_lo, accum_hi);

   mme_store_global(b, dst_addr, accum.lo);
   mme_store_global(b, mme_add64(b, dst_addr, mme_imm64(4)), accum.hi);
}

static void
nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
                        struct nvk_query_pool *pool,
                        uint32_t query, uint32_t index,
                        bool end)
{
   uint64_t report_addr = nvk_query_report_addr(pool, query) +
                          end * sizeof(struct nvk_query_report);

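   /* When ending a query, reserve 7 extra dwords for the FLUSH_PENDING_WRITES
    * and the availability semaphore release emitted at the bottom of this
    * function.
    */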
   uint32_t end_size = 7 * end;

   struct nv_push *p;
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      p = nvk_cmd_buffer_push(cmd, 7 + end_size);

      P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, !end);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .report = REPORT_ZPASS_PIXEL_CNT64,
         .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
         .flush_disable = true,
      });
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
      p = nvk_cmd_buffer_push(cmd, stat_count * 5 + end_size);

      ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics;
      for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) {
         const struct nvk_3d_stat_query *sq = &nvk_3d_stat_queries[i];
         if (!(stats_left & sq->flag))
            continue;

         /* The 3D stat queries array MUST be sorted */
         assert(!(stats_left & (sq->flag - 1)));

         if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) {
            P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
            P_INLINE_DATA(p, report_addr >> 32);
            P_INLINE_DATA(p, report_addr);
         } else {
            P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
            P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
            P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
            P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
            P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
               .operation = OPERATION_REPORT_ONLY,
               .pipeline_location = sq->loc,
               .report = sq->report,
               .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
               .flush_disable = true,
            });
         }

         report_addr += 2 * sizeof(struct nvk_query_report);
         stats_left &= ~sq->flag;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      const uint32_t xfb_reports[] = {
         NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_SUCCEEDED,
         NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_NEEDED,
      };
      p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports) + end_size);
      for (uint32_t i = 0; i < ARRAY_SIZE(xfb_reports); ++i) {
         P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
         P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
         P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
         P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
         P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
            .operation = OPERATION_REPORT_ONLY,
            .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
            .report = xfb_reports[i],
            .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
            .sub_report = index,
            .flush_disable = true,
         });
         report_addr += 2 * sizeof(struct nvk_query_report);
      }
      break;
   }

   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      p = nvk_cmd_buffer_push(cmd, 5 + end_size);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
         .report = REPORT_VTG_PRIMITIVES_OUT,
         .sub_report = index,
         .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
         .flush_disable = true,
      });
      break;

   default:
      unreachable("Unsupported query type");
   }

   if (end) {
      P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0);

      uint64_t available_addr = nvk_query_available_addr(pool, query);
      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_RELEASE,
         .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                            VkQueryPool queryPool,
                            uint32_t query,
                            VkQueryControlFlags flags,
                            uint32_t index)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   nvk_cmd_begin_end_query(cmd, pool, query, index, false);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                          VkQueryPool queryPool,
                          uint32_t query,
                          uint32_t index)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   nvk_cmd_begin_end_query(cmd, pool, query, index, true);

   /* From the Vulkan spec:
    *
    * "If queries are used while executing a render pass instance that has
    * multiview enabled, the query uses N consecutive query indices in
    * the query pool (starting at query) where N is the number of bits set
    * in the view mask in the subpass the query is used in. How the
    * numerical results of the query are distributed among the queries is
    * implementation-dependent."
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries.
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1);
   }
}

static bool
nvk_query_is_available(struct nvk_query_pool *pool, uint32_t query)
{
   uint32_t *available = nvk_query_available_map(pool, query);
   return p_atomic_read(available) != 0;
}

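/* How long nvk_query_wait_for_available() polls before giving up and
 * declaring the device lost, in nanoseconds (2 seconds).
 */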
#define NVK_QUERY_TIMEOUT 2000000000ull

static VkResult
nvk_query_wait_for_available(struct nvk_device *dev,
                             struct nvk_query_pool *pool,
                             uint32_t query)
{
   uint64_t abs_timeout_ns = os_time_get_absolute_timeout(NVK_QUERY_TIMEOUT);

   while (os_time_get_nano() < abs_timeout_ns) {
      if (nvk_query_is_available(pool, query))
         return VK_SUCCESS;

      VkResult status = vk_device_check_status(&dev->vk);
      if (status != VK_SUCCESS)
         return status;
   }

   return vk_device_set_lost(&dev->vk, "query timeout");
}

static void
cpu_write_query_result(void *dst, uint32_t idx,
                       VkQueryResultFlags flags,
                       uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst;
      dst64[idx] = result;
   } else {
      uint32_t *dst32 = dst;
      dst32[idx] = result;
   }
}

static void
cpu_get_query_delta(void *dst, const struct nvk_query_report *src,
                    uint32_t idx, VkQueryResultFlags flags)
{
   uint64_t delta = src[idx * 2 + 1].value - src[idx * 2].value;
   cpu_write_query_result(dst, idx, flags, delta);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_GetQueryPoolResults(VkDevice device,
                        VkQueryPool queryPool,
                        uint32_t firstQuery,
                        uint32_t queryCount,
                        size_t dataSize,
                        void *pData,
                        VkDeviceSize stride,
                        VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   if (vk_device_is_lost(&dev->vk))
      return VK_ERROR_DEVICE_LOST;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      const uint32_t query = firstQuery + i;

      bool available = nvk_query_is_available(pool, query);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = nvk_query_wait_for_available(dev, pool, query);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      const struct nvk_query_report *src = nvk_query_report_map(pool, query);
      assert(i * stride < dataSize);
      void *dst = (char *)pData + i * stride;

      uint32_t available_dst_idx = 1;
      switch (pool->vk.query_type) {
      case VK_QUERY_TYPE_OCCLUSION:
      case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
         if (write_results)
            cpu_get_query_delta(dst, src, 0, flags);
         break;
      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
         available_dst_idx = stat_count;
         if (write_results) {
            for (uint32_t j = 0; j < stat_count; j++)
               cpu_get_query_delta(dst, src, j, flags);
         }
         break;
      }
      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         const int prims_succeeded_idx = 0;
         const int prims_needed_idx = 1;
         available_dst_idx = 2;
         if (write_results) {
            cpu_get_query_delta(dst, src, prims_succeeded_idx, flags);
            cpu_get_query_delta(dst, src, prims_needed_idx, flags);
         }
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(dst, 0, flags, src->timestamp);
         break;
      default:
         unreachable("Unsupported query type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(dst, available_dst_idx, flags, available);
   }

   return status;
}

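/* Push constants for the query-copy compute shader.  The C layout here must
 * match the std140 push_fields interface declared in
 * build_copy_queries_shader() (offsets 0, 8, 12, 16, 20, 24, 32, 40).
 */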
struct nvk_copy_query_push {
   uint64_t pool_addr;
   uint32_t query_start;
   uint32_t query_stride;
   uint32_t first_query;
   uint32_t query_count;
   uint64_t dst_addr;
   uint64_t dst_stride;
   uint32_t flags;
};

static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *deref =
      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
   return nir_load_deref(b, deref);
}

static void
nir_write_query_result(nir_builder *b, nir_def *dst_addr,
                       nir_def *idx, nir_def *flags,
                       nir_def *result)
{
   assert(result->num_components == 1);
   assert(result->bit_size == 64);

   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
   {
      nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 8));
      nir_store_global(b, nir_iadd(b, dst_addr, offset), 8, result, 0x1);
   }
   nir_push_else(b, NULL);
   {
      nir_def *result32 = nir_u2u32(b, result);
      nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 4));
      nir_store_global(b, nir_iadd(b, dst_addr, offset), 4, result32, 0x1);
   }
   nir_pop_if(b, NULL);
}

static void
nir_get_query_delta(nir_builder *b, nir_def *dst_addr,
                    nir_def *report_addr, nir_def *idx,
                    nir_def *flags)
{
   nir_def *offset =
      nir_imul_imm(b, idx, 2 * sizeof(struct nvk_query_report));
   nir_def *begin_addr =
      nir_iadd(b, report_addr, nir_i2i64(b, offset));
   nir_def *end_addr =
      nir_iadd_imm(b, begin_addr, sizeof(struct nvk_query_report));

   /* nvk_query_report::value is the first uint64_t in each report */
   nir_def *begin = nir_load_global(b, begin_addr, 16, 1, 64);
   nir_def *end = nir_load_global(b, end_addr, 16, 1, 64);

   nir_def *delta = nir_isub(b, end, begin);

   nir_write_query_result(b, dst_addr, idx, flags, delta);
}

static void
nvk_nir_copy_query(nir_builder *b, nir_variable *push, nir_def *i)
{
   nir_def *pool_addr = load_struct_var(b, push, 0);
   nir_def *query_start = nir_u2u64(b, load_struct_var(b, push, 1));
   nir_def *query_stride = load_struct_var(b, push, 2);
   nir_def *first_query = load_struct_var(b, push, 3);
   nir_def *dst_addr = load_struct_var(b, push, 5);
   nir_def *dst_stride = load_struct_var(b, push, 6);
   nir_def *flags = load_struct_var(b, push, 7);

   nir_def *query = nir_iadd(b, first_query, i);

   nir_def *avail_addr = nvk_nir_available_addr(b, pool_addr, query);
   nir_def *available =
      nir_i2b(b, nir_load_global(b, avail_addr, 4, 1, 32));

   nir_def *partial = nir_test_mask(b, flags, VK_QUERY_RESULT_PARTIAL_BIT);
   nir_def *write_results = nir_ior(b, available, partial);

   nir_def *report_addr =
      nvk_nir_query_report_addr(b, pool_addr, query_start, query_stride,
                                query);
   nir_def *dst_offset = nir_imul(b, nir_u2u64(b, i), dst_stride);

   /* Timestamp queries are the only ones that use a single report */
   nir_def *is_timestamp =
      nir_ieq_imm(b, query_stride, sizeof(struct nvk_query_report));

   nir_def *one = nir_imm_int(b, 1);
   nir_def *num_reports;
   nir_push_if(b, is_timestamp);
   {
      nir_push_if(b, write_results);
      {
         /* This is the timestamp case. We add 8 because we're loading
          * nvk_query_report::timestamp.
          */
         nir_def *timestamp =
            nir_load_global(b, nir_iadd_imm(b, report_addr, 8), 8, 1, 64);

         nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
                                nir_imm_int(b, 0), flags, timestamp);
      }
      nir_pop_if(b, NULL);
   }
   nir_push_else(b, NULL);
   {
      /* Everything that isn't a timestamp has the invariant that the
       * number of destination entries is equal to the query stride divided
       * by the size of two reports.
       */
      num_reports = nir_udiv_imm(b, query_stride,
                                 2 * sizeof(struct nvk_query_report));

      nir_push_if(b, write_results);
      {
         nir_variable *r =
            nir_local_variable_create(b->impl, glsl_uint_type(), "r");
         nir_store_var(b, r, nir_imm_int(b, 0), 0x1);

         nir_push_loop(b);
         {
            nir_push_if(b, nir_ige(b, nir_load_var(b, r), num_reports));
            {
               nir_jump(b, nir_jump_break);
            }
            nir_pop_if(b, NULL);

            nir_get_query_delta(b, nir_iadd(b, dst_addr, dst_offset),
                                report_addr, nir_load_var(b, r), flags);

            nir_store_var(b, r, nir_iadd_imm(b, nir_load_var(b, r), 1), 0x1);
         }
         nir_pop_loop(b, NULL);
      }
      nir_pop_if(b, NULL);
   }
   nir_pop_if(b, NULL);

   num_reports = nir_if_phi(b, one, num_reports);

   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
   {
      nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
                             num_reports, flags, nir_b2i64(b, available));
   }
   nir_pop_if(b, NULL);
}

static nir_shader *
build_copy_queries_shader(void)
{
   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-meta-copy-queries");
   nir_builder *b = &build;

   struct glsl_struct_field push_fields[] = {
      { .type = glsl_uint64_t_type(), .name = "pool_addr", .offset = 0 },
      { .type = glsl_uint_type(), .name = "query_start", .offset = 8 },
      { .type = glsl_uint_type(), .name = "query_stride", .offset = 12 },
      { .type = glsl_uint_type(), .name = "first_query", .offset = 16 },
      { .type = glsl_uint_type(), .name = "query_count", .offset = 20 },
      { .type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 24 },
      { .type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 32 },
      { .type = glsl_uint_type(), .name = "flags", .offset = 40 },
   };
   const struct glsl_type *push_iface_type =
      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
                          GLSL_INTERFACE_PACKING_STD140,
                          false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   nir_def *query_count = load_struct_var(b, push, 4);

   nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i");
   nir_store_var(b, i, nir_imm_int(b, 0), 0x1);

   nir_push_loop(b);
   {
      nir_push_if(b, nir_ige(b, nir_load_var(b, i), query_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      nvk_nir_copy_query(b, push, nir_load_var(b, i));

      nir_store_var(b, i, nir_iadd_imm(b, nir_load_var(b, i), 1), 0x1);
   }
   nir_pop_loop(b, NULL);

   return build.shader;
}

static VkResult
get_copy_queries_pipeline(struct nvk_device *dev,
                          VkPipelineLayout layout,
                          VkPipeline *pipeline_out)
{
   const char key[] = "nvk-meta-copy-query-pool-results";
   VkPipeline cached = vk_meta_lookup_pipeline(&dev->meta, key, sizeof(key));
   if (cached != VK_NULL_HANDLE) {
      *pipeline_out = cached;
      return VK_SUCCESS;
   }

   const VkPipelineShaderStageNirCreateInfoMESA nir_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA,
      .nir = build_copy_queries_shader(),
   };
   const VkComputePipelineCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
         .pNext = &nir_info,
         .stage = VK_SHADER_STAGE_COMPUTE_BIT,
         .pName = "main",
      },
      .layout = layout,
   };

   return vk_meta_create_compute_pipeline(&dev->vk, &dev->meta, &info,
                                          key, sizeof(key), pipeline_out);
}

static void
nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd,
                                 struct nvk_query_pool *pool,
                                 uint32_t first_query,
                                 uint32_t query_count,
                                 uint64_t dst_addr,
                                 uint64_t dst_stride,
                                 VkQueryResultFlags flags)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;
   VkResult result;

   const struct nvk_copy_query_push push = {
      .pool_addr = pool->bo->offset,
      .query_start = pool->query_start,
      .query_stride = pool->query_stride,
      .first_query = first_query,
      .query_count = query_count,
      .dst_addr = dst_addr,
      .dst_stride = dst_stride,
      .flags = flags,
   };

   const char key[] = "nvk-meta-copy-query-pool-results";
   const VkPushConstantRange push_range = {
      .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      .size = sizeof(push),
   };
   VkPipelineLayout layout;
   result = vk_meta_get_pipeline_layout(&dev->vk, &dev->meta, NULL, &push_range,
                                        key, sizeof(key), &layout);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   VkPipeline pipeline;
   result = get_copy_queries_pipeline(dev, layout, &pipeline);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   /* Save pipeline and push constants */
   struct nvk_shader *shader_save = cmd->state.cs.shader;
   uint8_t push_save[NVK_MAX_PUSH_SIZE];
   memcpy(push_save, desc->root.push, NVK_MAX_PUSH_SIZE);

   dev->vk.dispatch_table.CmdBindPipeline(nvk_cmd_buffer_to_handle(cmd),
                                          VK_PIPELINE_BIND_POINT_COMPUTE,
                                          pipeline);

   nvk_CmdPushConstants(nvk_cmd_buffer_to_handle(cmd), layout,
                        VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push), &push);

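   /* The copy shader itself loops over query_count queries, so a single
    * 1x1x1 dispatch is sufficient here.
    */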
   nvk_CmdDispatchBase(nvk_cmd_buffer_to_handle(cmd), 0, 0, 0, 1, 1, 1);

   /* Restore pipeline and push constants */
   if (shader_save)
      nvk_cmd_bind_compute_shader(cmd, shader_save);
   memcpy(desc->root.push, push_save, NVK_MAX_PUSH_SIZE);
}

void
nvk_mme_copy_queries(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d < TURING_A)
      return;

   struct mme_value64 dst_addr = mme_load_addr64(b);
   struct mme_value64 dst_stride = mme_load_addr64(b);
   struct mme_value64 avail_addr = mme_load_addr64(b);
   struct mme_value64 report_addr = mme_load_addr64(b);

   struct mme_value query_count = mme_load(b);
   struct mme_value control = mme_load(b);

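   /* The caller (nvk_cmd_copy_query_pool_results_mme) packs the control
    * parameter as: flags in bits 0..7, query_stride in bytes in bits 8..23,
    * and an is-timestamp flag in bit 24.
    */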
   struct mme_value flags = control;
   struct mme_value write64 =
      mme_and(b, flags, mme_imm(VK_QUERY_RESULT_64_BIT));
   struct mme_value query_stride =
      mme_merge(b, mme_zero(), control, 0, 16, 8);
   struct mme_value is_timestamp =
      mme_merge(b, mme_zero(), control, 0, 1, 24);

   mme_while(b, ugt, query_count, mme_zero()) {
      struct mme_value dw_per_query = mme_srl(b, query_stride, mme_imm(2));
      mme_tu104_read_fifoed(b, report_addr, dw_per_query);
      mme_free_reg(b, dw_per_query);

      struct mme_value64 write_addr = mme_mov64(b, dst_addr);
      struct mme_value report_count = mme_srl(b, query_stride, mme_imm(4));
      mme_while(b, ugt, report_count, mme_zero()) {
         struct mme_value result_lo = mme_alloc_reg(b);
         struct mme_value result_hi = mme_alloc_reg(b);
         struct mme_value64 result = mme_value64(result_lo, result_hi);

         mme_if(b, ine, is_timestamp, mme_zero()) {
            mme_load_to(b, mme_zero());
            mme_load_to(b, mme_zero());
            mme_load_to(b, result_lo);
            mme_load_to(b, result_hi);
            mme_sub_to(b, report_count, report_count, mme_imm(1));
         }
         mme_if(b, ieq, is_timestamp, mme_zero()) {
            struct mme_value begin_lo = mme_load(b);
            struct mme_value begin_hi = mme_load(b);
            struct mme_value64 begin = mme_value64(begin_lo, begin_hi);
            mme_load_to(b, mme_zero());
            mme_load_to(b, mme_zero());

            struct mme_value end_lo = mme_load(b);
            struct mme_value end_hi = mme_load(b);
            struct mme_value64 end = mme_value64(end_lo, end_hi);
            mme_load_to(b, mme_zero());
            mme_load_to(b, mme_zero());

            mme_sub64_to(b, result, end, begin);
            mme_sub_to(b, report_count, report_count, mme_imm(2));

            mme_free_reg64(b, begin);
            mme_free_reg64(b, end);
         }

         mme_store_global(b, write_addr, result_lo);
         mme_add64_to(b, write_addr, write_addr, mme_imm64(4));
         mme_if(b, ine, write64, mme_zero()) {
            mme_store_global(b, write_addr, result_hi);
            mme_add64_to(b, write_addr, write_addr, mme_imm64(4));
         }
      }

      struct mme_value with_availability =
         mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
      mme_if(b, ine, with_availability, mme_zero()) {
         mme_tu104_read_fifoed(b, avail_addr, mme_imm(1));
         struct mme_value avail = mme_load(b);
         mme_store_global(b, write_addr, avail);
         mme_if(b, ine, write64, mme_zero()) {
            mme_add64_to(b, write_addr, write_addr, mme_imm64(4));
            mme_store_global(b, write_addr, mme_zero());
         }
      }
      mme_free_reg(b, with_availability);

      mme_add64_to(b, avail_addr, avail_addr, mme_imm64(4));

      mme_add64_to(b, report_addr, report_addr,
                   mme_value64(query_stride, mme_zero()));

      mme_add64_to(b, dst_addr, dst_addr, dst_stride);

      mme_sub_to(b, query_count, query_count, mme_imm(1));
   }
}

static void
nvk_cmd_copy_query_pool_results_mme(struct nvk_cmd_buffer *cmd,
                                    struct nvk_query_pool *pool,
                                    uint32_t first_query,
                                    uint32_t query_count,
                                    uint64_t dst_addr,
                                    uint64_t dst_stride,
                                    VkQueryResultFlags flags)
{
   /* TODO: vkCmdCopyQueryPoolResults() with a compute shader */
   assert(nvk_cmd_buffer_device(cmd)->pdev->info.cls_eng3d >= TURING_A);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 13);
   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
   P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_COPY_QUERIES));

   P_INLINE_DATA(p, dst_addr >> 32);
   P_INLINE_DATA(p, dst_addr);
   P_INLINE_DATA(p, dst_stride >> 32);
   P_INLINE_DATA(p, dst_stride);

   uint64_t avail_start = nvk_query_available_addr(pool, first_query);
   P_INLINE_DATA(p, avail_start >> 32);
   P_INLINE_DATA(p, avail_start);

   uint64_t report_start = nvk_query_report_addr(pool, first_query);
   P_INLINE_DATA(p, report_start >> 32);
   P_INLINE_DATA(p, report_start);

   P_INLINE_DATA(p, query_count);

   uint32_t is_timestamp = pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP;

   uint32_t control = (flags & 0xff) |
                      (pool->query_stride << 8) |
                      (is_timestamp << 24);
   P_INLINE_DATA(p, control);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                            VkQueryPool queryPool,
                            uint32_t firstQuery,
                            uint32_t queryCount,
                            VkBuffer dstBuffer,
                            VkDeviceSize dstOffset,
                            VkDeviceSize stride,
                            VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer);

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      for (uint32_t i = 0; i < queryCount; i++) {
         uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i);

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
         __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
         P_NV906F_SEMAPHOREA(p, avail_addr >> 32);
         P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2);
         P_NV906F_SEMAPHOREC(p, 1);
         P_NV906F_SEMAPHORED(p, {
            .operation = OPERATION_ACQ_GEQ,
            .acquire_switch = ACQUIRE_SWITCH_ENABLED,
            .release_size = RELEASE_SIZE_4BYTE,
         });
      }
   }

   uint64_t dst_addr = nvk_buffer_address(dst_buffer, dstOffset);
   nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount,
                                    dst_addr, stride, flags);
}