/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_query_pool.h"

#include "nvk_buffer.h"
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_event.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"

#include "vk_meta.h"
#include "vk_pipeline.h"

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"

#include "nouveau_bo.h"
#include "nouveau_context.h"

#include "util/os_time.h"

#include "nvk_cl906f.h"
#include "nvk_cl9097.h"
#include "nvk_cla0c0.h"
#include "nvk_clc597.h"

struct nvk_query_report {
   uint64_t value;
   uint64_t timestamp;
};
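
/* Pool BO memory layout, as a sketch derived from the helpers below:
 *
 *    offset 0:           uint32_t available[query_count]
 *    pool->query_start:  per-query report data, query_stride bytes per
 *                        query (reports_per_query nvk_query_reports each),
 *                        aligned to sizeof(struct nvk_query_report)
 */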

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateQueryPool(VkDevice device,
                    const VkQueryPoolCreateInfo *pCreateInfo,
                    const VkAllocationCallbacks *pAllocator,
                    VkQueryPool *pQueryPool)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   struct nvk_query_pool *pool;

   pool = vk_query_pool_create(&dev->vk, pCreateInfo,
                               pAllocator, sizeof(*pool));
   if (!pool)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* We place the availability first and then data */
   pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
                             sizeof(struct nvk_query_report));

   uint32_t reports_per_query;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      reports_per_query = 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      reports_per_query = 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      reports_per_query = 2 * util_bitcount(pool->vk.pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* 2 for primitives succeeded, 2 for primitives needed */
      reports_per_query = 4;
      break;
   default:
      unreachable("Unsupported query type");
   }
   pool->query_stride = reports_per_query * sizeof(struct nvk_query_report);
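   /* For example, an occlusion query uses a begin/end pair of reports, so
    * its query_stride is 2 * 16 = 32 bytes, while a timestamp query uses a
    * single 16-byte report.
    */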

   if (pool->vk.query_count > 0) {
      uint32_t bo_size = pool->query_start +
                         pool->query_stride * pool->vk.query_count;
      pool->bo = nouveau_ws_bo_new_mapped(dev->ws_dev, bo_size, 0,
                                          NOUVEAU_WS_BO_GART |
                                          NOUVEAU_WS_BO_NO_SHARE,
                                          NOUVEAU_WS_BO_RDWR,
                                          &pool->bo_map);
      if (!pool->bo) {
         vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
         return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      }

      if (dev->ws_dev->debug_flags & NVK_DEBUG_ZERO_MEMORY)
         memset(pool->bo_map, 0, bo_size);
   }

   *pQueryPool = nvk_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyQueryPool(VkDevice device,
                     VkQueryPool queryPool,
                     const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   if (!pool)
      return;

   if (pool->bo) {
      nouveau_ws_bo_unmap(pool->bo, pool->bo_map);
      nouveau_ws_bo_destroy(pool->bo);
   }
   vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
}

static uint64_t
nvk_query_available_addr(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->bo->offset + query * sizeof(uint32_t);
}

static nir_def *
nvk_nir_available_addr(nir_builder *b, nir_def *pool_addr,
                       nir_def *query)
{
   nir_def *offset = nir_imul_imm(b, query, sizeof(uint32_t));
   return nir_iadd(b, pool_addr, nir_u2u64(b, offset));
}

static uint32_t *
nvk_query_available_map(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return (uint32_t *)pool->bo_map + query;
}

static uint64_t
nvk_query_offset(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->query_start + query * pool->query_stride;
}

static uint64_t
nvk_query_report_addr(struct nvk_query_pool *pool, uint32_t query)
{
   return pool->bo->offset + nvk_query_offset(pool, query);
}

static nir_def *
nvk_nir_query_report_addr(nir_builder *b, nir_def *pool_addr,
                          nir_def *query_start, nir_def *query_stride,
                          nir_def *query)
{
   nir_def *offset =
      nir_iadd(b, query_start, nir_umul_2x32_64(b, query, query_stride));
   return nir_iadd(b, pool_addr, offset);
}

static struct nvk_query_report *
nvk_query_report_map(struct nvk_query_pool *pool, uint32_t query)
{
   return (void *)((char *)pool->bo_map + nvk_query_offset(pool, query));
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct nvk_cmd_buffer *cmd, struct nvk_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
      for (uint32_t i = 0; i < num_queries; i++) {
         uint64_t addr = nvk_query_available_addr(pool, first_index + i);

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
         P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
         P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
         P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
         P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
         P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
            .operation = OPERATION_RELEASE,
            .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
            .pipeline_location = PIPELINE_LOCATION_ALL,
            .structure_size = STRUCTURE_SIZE_ONE_WORD,
         });
      }
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_ResetQueryPool(VkDevice device,
                   VkQueryPool queryPool,
                   uint32_t firstQuery,
                   uint32_t queryCount)
{
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   uint32_t *available = nvk_query_available_map(pool, firstQuery);
   memset(available, 0, queryCount * sizeof(*available));
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                      VkQueryPool queryPool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_RELEASE,
         .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }

   /* Wait for the above writes to complete.  This prevents WaW hazards on any
    * later query availability updates and ensures vkCmdCopyQueryPoolResults
    * will see the query as unavailable if it happens before the query is
    * completed again.
    */
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
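      /* This is a host-interface (NV906F) semaphore acquire.  Note that
       * SEMAPHOREB takes the low 32 bits of the address shifted right by 2,
       * so the availability dword must be (and is) 4-byte aligned.
       */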
      __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
      P_NV906F_SEMAPHOREA(p, addr >> 32);
      P_NV906F_SEMAPHOREB(p, (addr & UINT32_MAX) >> 2);
      P_NV906F_SEMAPHOREC(p, 0);
      P_NV906F_SEMAPHORED(p, {
         .operation = OPERATION_ACQUIRE,
         .acquire_switch = ACQUIRE_SWITCH_ENABLED,
         .release_size = RELEASE_SIZE_4BYTE,
      });
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                       VkPipelineStageFlags2 stage,
                       VkQueryPool queryPool,
                       uint32_t query)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);

   uint64_t report_addr = nvk_query_report_addr(pool, query);
   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
   P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
   P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
      .operation = OPERATION_REPORT_ONLY,
      .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
      .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
   });

   uint64_t available_addr = nvk_query_available_addr(pool, query);
   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
   P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
   P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
      .operation = OPERATION_RELEASE,
      .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
      .pipeline_location = PIPELINE_LOCATION_ALL,
      .structure_size = STRUCTURE_SIZE_ONE_WORD,
   });

   /* From the Vulkan spec:
    *
    *   "If vkCmdWriteTimestamp2 is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:"
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries, as described in the first behavior listed in the
    * Vulkan spec:
    *
    *   "The first query is a timestamp value and (if more than one bit is set
    *   in the view mask) zero is written to the remaining queries."
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1);
   }
}

struct nvk_3d_stat_query {
   VkQueryPipelineStatisticFlagBits flag;
   uint8_t loc;
   uint8_t report;
};

/* This must remain sorted in flag order */
static const struct nvk_3d_stat_query nvk_3d_stat_queries[] = {{
   .flag    = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER,
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER,
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER,
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
   .loc     = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER,
   .report  = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS,
}, {
   .flag    = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT,
   .loc     = UINT8_MAX,
   .report  = UINT8_MAX,
}};
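
/* The compute entry has no 3D pipeline report of its own (hence the
 * UINT8_MAX sentinels); nvk_cmd_begin_end_query() handles it separately via
 * the NVK_MME_WRITE_CS_INVOCATIONS macro.
 */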
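
/* mme_store_global() emits a SET_REPORT_SEMAPHORE_{A,B,C,D} sequence that
 * stores a 32-bit value to an arbitrary GPU address.  The 0x10000000
 * immediate is the pre-packed SEMAPHORE_D dword for this kind of store; see
 * the cl9097 headers for its field decoding.
 */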
static void
mme_store_global(struct mme_builder *b,
                 struct mme_value64 addr,
                 struct mme_value v)
{
   mme_mthd(b, NV9097_SET_REPORT_SEMAPHORE_A);
   mme_emit_addr64(b, addr);
   mme_emit(b, v);
   mme_emit(b, mme_imm(0x10000000));
}
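
/* Writes the 64-bit compute-shader invocation count, which the driver
 * accumulates in MME shadow scratch registers (compute work does not go
 * through the 3D report mechanism), to the address passed as two
 * inline-data dwords.
 */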
void
nvk_mme_write_cs_invocations(struct mme_builder *b)
{
   struct mme_value64 dst_addr = mme_load_addr64(b);

   struct mme_value accum_hi = mme_state(b,
      NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI));
   struct mme_value accum_lo = mme_state(b,
      NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO));
   struct mme_value64 accum = mme_value64(accum_lo, accum_hi);

   mme_store_global(b, dst_addr, accum.lo);
   mme_store_global(b, mme_add64(b, dst_addr, mme_imm64(4)), accum.hi);
}

static void
nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
                        struct nvk_query_pool *pool,
                        uint32_t query, uint32_t index,
                        bool end)
{
   uint64_t report_addr = nvk_query_report_addr(pool, query) +
                          end * sizeof(struct nvk_query_report);

   uint32_t end_size = 7 * end;
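   /* When ending a query, we also emit a flush (2 dwords) and an
    * availability release (5 dwords) at the bottom of this function, hence
    * the 7 extra dwords of push space.
    */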

   struct nv_push *p;
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      p = nvk_cmd_buffer_push(cmd, 7 + end_size);

      P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, !end);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .report = REPORT_ZPASS_PIXEL_CNT64,
         .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
         .flush_disable = true,
      });
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
      p = nvk_cmd_buffer_push(cmd, stat_count * 5 + end_size);

      ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics;
      for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) {
         const struct nvk_3d_stat_query *sq = &nvk_3d_stat_queries[i];
         if (!(stats_left & sq->flag))
            continue;

         /* The 3D stat queries array MUST be sorted */
         assert(!(stats_left & (sq->flag - 1)));

         if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) {
            P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
            P_INLINE_DATA(p, report_addr >> 32);
            P_INLINE_DATA(p, report_addr);
         } else {
            P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
            P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
            P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
            P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
            P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
               .operation = OPERATION_REPORT_ONLY,
               .pipeline_location = sq->loc,
               .report = sq->report,
               .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
               .flush_disable = true,
            });
         }

         report_addr += 2 * sizeof(struct nvk_query_report);
         stats_left &= ~sq->flag;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      const uint32_t xfb_reports[] = {
         NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_SUCCEEDED,
         NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_NEEDED,
      };
      p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports) + end_size);
      for (uint32_t i = 0; i < ARRAY_SIZE(xfb_reports); ++i) {
         P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
         P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
         P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
         P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
         P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
               .operation = OPERATION_REPORT_ONLY,
               .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
               .report = xfb_reports[i],
               .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
               .sub_report = index,
               .flush_disable = true,
               });
         report_addr += 2 * sizeof(struct nvk_query_report);
      }
      break;
   }

   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      p = nvk_cmd_buffer_push(cmd, 5 + end_size);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
         .report = REPORT_VTG_PRIMITIVES_OUT,
         .sub_report = index,
         .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
         .flush_disable = true,
      });
      break;

   default:
      unreachable("Unsupported query type");
   }

   if (end) {
      P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0);

      uint64_t available_addr = nvk_query_available_addr(pool, query);
      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_RELEASE,
         .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                            VkQueryPool queryPool,
                            uint32_t query,
                            VkQueryControlFlags flags,
                            uint32_t index)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   nvk_cmd_begin_end_query(cmd, pool, query, index, false);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                          VkQueryPool queryPool,
                          uint32_t query,
                          uint32_t index)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   nvk_cmd_begin_end_query(cmd, pool, query, index, true);

   /* From the Vulkan spec:
    *
    *   "If queries are used while executing a render pass instance that has
    *    multiview enabled, the query uses N consecutive query indices in
    *    the query pool (starting at query) where N is the number of bits set
    *    in the view mask in the subpass the query is used in. How the
    *    numerical results of the query are distributed among the queries is
    *    implementation-dependent."
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries.
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1);
   }
}

static bool
nvk_query_is_available(struct nvk_query_pool *pool, uint32_t query)
{
   uint32_t *available = nvk_query_available_map(pool, query);
   return p_atomic_read(available) != 0;
}
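
/* How long to wait on a pending query before declaring the device lost:
 * two seconds, in nanoseconds.
 */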
#define NVK_QUERY_TIMEOUT 2000000000ull

static VkResult
nvk_query_wait_for_available(struct nvk_device *dev,
                             struct nvk_query_pool *pool,
                             uint32_t query)
{
   uint64_t abs_timeout_ns = os_time_get_absolute_timeout(NVK_QUERY_TIMEOUT);

   while (os_time_get_nano() < abs_timeout_ns) {
      if (nvk_query_is_available(pool, query))
         return VK_SUCCESS;

      VkResult status = vk_device_check_status(&dev->vk);
      if (status != VK_SUCCESS)
         return status;
   }

   return vk_device_set_lost(&dev->vk, "query timeout");
}

static void
cpu_write_query_result(void *dst, uint32_t idx,
                       VkQueryResultFlags flags,
                       uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst;
      dst64[idx] = result;
   } else {
      uint32_t *dst32 = dst;
      dst32[idx] = result;
   }
}
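
/* Each counter query records a (begin, end) pair of reports; the value the
 * application sees is end.value - begin.value.
 */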
static void
cpu_get_query_delta(void *dst, const struct nvk_query_report *src,
                    uint32_t idx, VkQueryResultFlags flags)
{
   uint64_t delta = src[idx * 2 + 1].value - src[idx * 2].value;
   cpu_write_query_result(dst, idx, flags, delta);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_GetQueryPoolResults(VkDevice device,
                        VkQueryPool queryPool,
                        uint32_t firstQuery,
                        uint32_t queryCount,
                        size_t dataSize,
                        void *pData,
                        VkDeviceSize stride,
                        VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   if (vk_device_is_lost(&dev->vk))
      return VK_ERROR_DEVICE_LOST;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      const uint32_t query = firstQuery + i;

      bool available = nvk_query_is_available(pool, query);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = nvk_query_wait_for_available(dev, pool, query);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      const struct nvk_query_report *src = nvk_query_report_map(pool, query);
      assert(i * stride < dataSize);
      void *dst = (char *)pData + i * stride;

      uint32_t available_dst_idx = 1;
      switch (pool->vk.query_type) {
      case VK_QUERY_TYPE_OCCLUSION:
      case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
         if (write_results)
            cpu_get_query_delta(dst, src, 0, flags);
         break;
      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
         available_dst_idx = stat_count;
         if (write_results) {
            for (uint32_t j = 0; j < stat_count; j++)
               cpu_get_query_delta(dst, src, j, flags);
         }
         break;
      }
      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         const int prims_succeeded_idx = 0;
         const int prims_needed_idx = 1;
         available_dst_idx = 2;
         if (write_results) {
            cpu_get_query_delta(dst, src, prims_succeeded_idx, flags);
            cpu_get_query_delta(dst, src, prims_needed_idx, flags);
         }
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(dst, 0, flags, src->timestamp);
         break;
      default:
         unreachable("Unsupported query type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(dst, available_dst_idx, flags, available);
   }

   return status;
}

struct nvk_copy_query_push {
   uint64_t pool_addr;
   uint32_t query_start;
   uint32_t query_stride;
   uint32_t first_query;
   uint32_t query_count;
   uint64_t dst_addr;
   uint64_t dst_stride;
   uint32_t flags;
};
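
/* Note: this layout must match the std140 push_fields interface built in
 * build_copy_queries_shader() below (offsets 0, 8, 12, 16, 20, 24, 32, 40).
 */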

static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *deref =
      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
   return nir_load_deref(b, deref);
}

static void
nir_write_query_result(nir_builder *b, nir_def *dst_addr,
                       nir_def *idx, nir_def *flags,
                       nir_def *result)
{
   assert(result->num_components == 1);
   assert(result->bit_size == 64);

   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
   {
      nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 8));
      nir_store_global(b, nir_iadd(b, dst_addr, offset), 8, result, 0x1);
   }
   nir_push_else(b, NULL);
   {
      nir_def *result32 = nir_u2u32(b, result);
      nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 4));
      nir_store_global(b, nir_iadd(b, dst_addr, offset), 4, result32, 0x1);
   }
   nir_pop_if(b, NULL);
}

static void
nir_get_query_delta(nir_builder *b, nir_def *dst_addr,
                    nir_def *report_addr, nir_def *idx,
                    nir_def *flags)
{
   nir_def *offset =
      nir_imul_imm(b, idx, 2 * sizeof(struct nvk_query_report));
   nir_def *begin_addr =
      nir_iadd(b, report_addr, nir_i2i64(b, offset));
   nir_def *end_addr =
      nir_iadd_imm(b, begin_addr, sizeof(struct nvk_query_report));

   /* nvk_query_report::value is the first uint64_t */
   nir_def *begin = nir_load_global(b, begin_addr, 16, 1, 64);
   nir_def *end = nir_load_global(b, end_addr, 16, 1, 64);

   nir_def *delta = nir_isub(b, end, begin);

   nir_write_query_result(b, dst_addr, idx, flags, delta);
}

static void
nvk_nir_copy_query(nir_builder *b, nir_variable *push, nir_def *i)
{
   nir_def *pool_addr = load_struct_var(b, push, 0);
   nir_def *query_start = nir_u2u64(b, load_struct_var(b, push, 1));
   nir_def *query_stride = load_struct_var(b, push, 2);
   nir_def *first_query = load_struct_var(b, push, 3);
   nir_def *dst_addr = load_struct_var(b, push, 5);
   nir_def *dst_stride = load_struct_var(b, push, 6);
   nir_def *flags = load_struct_var(b, push, 7);

   nir_def *query = nir_iadd(b, first_query, i);

   nir_def *avail_addr = nvk_nir_available_addr(b, pool_addr, query);
   nir_def *available =
      nir_i2b(b, nir_load_global(b, avail_addr, 4, 1, 32));

   nir_def *partial = nir_test_mask(b, flags, VK_QUERY_RESULT_PARTIAL_BIT);
   nir_def *write_results = nir_ior(b, available, partial);

   nir_def *report_addr =
      nvk_nir_query_report_addr(b, pool_addr, query_start, query_stride,
                                query);
   nir_def *dst_offset = nir_imul(b, nir_u2u64(b, i), dst_stride);

   /* Timestamp queries are the only ones that use a single report */
   nir_def *is_timestamp =
      nir_ieq_imm(b, query_stride, sizeof(struct nvk_query_report));

   nir_def *one = nir_imm_int(b, 1);
   nir_def *num_reports;
   nir_push_if(b, is_timestamp);
   {
      nir_push_if(b, write_results);
      {
         /* This is the timestamp case.  We add 8 because we're loading
          * nvk_query_report::timestamp.
          */
         nir_def *timestamp =
            nir_load_global(b, nir_iadd_imm(b, report_addr, 8), 8, 1, 64);

         nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
                                nir_imm_int(b, 0), flags, timestamp);
      }
      nir_pop_if(b, NULL);
   }
   nir_push_else(b, NULL);
   {
      /* Everything that isn't a timestamp has the invariant that the
       * number of destination entries is equal to the query stride divided
       * by the size of two reports.
       */
      num_reports = nir_udiv_imm(b, query_stride,
                                 2 * sizeof(struct nvk_query_report));

      nir_push_if(b, write_results);
      {
         nir_variable *r =
            nir_local_variable_create(b->impl, glsl_uint_type(), "r");
         nir_store_var(b, r, nir_imm_int(b, 0), 0x1);

         nir_push_loop(b);
         {
            nir_push_if(b, nir_ige(b, nir_load_var(b, r), num_reports));
            {
               nir_jump(b, nir_jump_break);
            }
            nir_pop_if(b, NULL);

            nir_get_query_delta(b, nir_iadd(b, dst_addr, dst_offset),
                                report_addr, nir_load_var(b, r), flags);

            nir_store_var(b, r, nir_iadd_imm(b, nir_load_var(b, r), 1), 0x1);
         }
         nir_pop_loop(b, NULL);
      }
      nir_pop_if(b, NULL);
   }
   nir_pop_if(b, NULL);

   num_reports = nir_if_phi(b, one, num_reports);

   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
   {
      nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
                             num_reports, flags, nir_b2i64(b, available));
   }
   nir_pop_if(b, NULL);
}

static nir_shader *
build_copy_queries_shader(void)
{
   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-meta-copy-queries");
   nir_builder *b = &build;

   struct glsl_struct_field push_fields[] = {
      { .type = glsl_uint64_t_type(), .name = "pool_addr", .offset = 0 },
      { .type = glsl_uint_type(), .name = "query_start", .offset = 8 },
      { .type = glsl_uint_type(), .name = "query_stride", .offset = 12 },
      { .type = glsl_uint_type(), .name = "first_query", .offset = 16 },
      { .type = glsl_uint_type(), .name = "query_count", .offset = 20 },
      { .type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 24 },
      { .type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 32 },
      { .type = glsl_uint_type(), .name = "flags", .offset = 40 },
   };
   const struct glsl_type *push_iface_type =
      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
                          GLSL_INTERFACE_PACKING_STD140,
                          false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   nir_def *query_count = load_struct_var(b, push, 4);

   nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i");
   nir_store_var(b, i, nir_imm_int(b, 0), 0x1);

   nir_push_loop(b);
   {
      nir_push_if(b, nir_ige(b, nir_load_var(b, i), query_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      nvk_nir_copy_query(b, push, nir_load_var(b, i));

      nir_store_var(b, i, nir_iadd_imm(b, nir_load_var(b, i), 1), 0x1);
   }
   nir_pop_loop(b, NULL);

   return build.shader;
}

static VkResult
get_copy_queries_pipeline(struct nvk_device *dev,
                          VkPipelineLayout layout,
                          VkPipeline *pipeline_out)
{
   const char key[] = "nvk-meta-copy-query-pool-results";
   VkPipeline cached = vk_meta_lookup_pipeline(&dev->meta, key, sizeof(key));
   if (cached != VK_NULL_HANDLE) {
      *pipeline_out = cached;
      return VK_SUCCESS;
   }

   const VkPipelineShaderStageNirCreateInfoMESA nir_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA,
      .nir = build_copy_queries_shader(),
   };
   const VkComputePipelineCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
         .pNext = &nir_info,
         .stage = VK_SHADER_STAGE_COMPUTE_BIT,
         .pName = "main",
      },
      .layout = layout,
   };

   return vk_meta_create_compute_pipeline(&dev->vk, &dev->meta, &info,
                                          key, sizeof(key), pipeline_out);
}

static void
nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd,
                                 struct nvk_query_pool *pool,
                                 uint32_t first_query,
                                 uint32_t query_count,
                                 uint64_t dst_addr,
                                 uint64_t dst_stride,
                                 VkQueryResultFlags flags)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;
   VkResult result;

   const struct nvk_copy_query_push push = {
      .pool_addr = pool->bo->offset,
      .query_start = pool->query_start,
      .query_stride = pool->query_stride,
      .first_query = first_query,
      .query_count = query_count,
      .dst_addr = dst_addr,
      .dst_stride = dst_stride,
      .flags = flags,
   };

   const char key[] = "nvk-meta-copy-query-pool-results";
   const VkPushConstantRange push_range = {
      .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      .size = sizeof(push),
   };
   VkPipelineLayout layout;
   result = vk_meta_get_pipeline_layout(&dev->vk, &dev->meta, NULL, &push_range,
                                        key, sizeof(key), &layout);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   VkPipeline pipeline;
   result = get_copy_queries_pipeline(dev, layout, &pipeline);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   /* Save pipeline and push constants */
   struct nvk_shader *shader_save = cmd->state.cs.shader;
   uint8_t push_save[NVK_MAX_PUSH_SIZE];
   memcpy(push_save, desc->root.push, NVK_MAX_PUSH_SIZE);

   dev->vk.dispatch_table.CmdBindPipeline(nvk_cmd_buffer_to_handle(cmd),
                                          VK_PIPELINE_BIND_POINT_COMPUTE,
                                          pipeline);

   nvk_CmdPushConstants(nvk_cmd_buffer_to_handle(cmd), layout,
                        VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push), &push);
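
   /* A single 1x1x1 dispatch: the shader loops over query_count internally,
    * so the whole copy runs as one workgroup.
    */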
   nvk_CmdDispatchBase(nvk_cmd_buffer_to_handle(cmd), 0, 0, 0, 1, 1, 1);

   /* Restore pipeline and push constants */
   if (shader_save)
      nvk_cmd_bind_compute_shader(cmd, shader_save);
   memcpy(desc->root.push, push_save, NVK_MAX_PUSH_SIZE);
}

void
nvk_mme_copy_queries(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d < TURING_A)
      return;

   struct mme_value64 dst_addr = mme_load_addr64(b);
   struct mme_value64 dst_stride = mme_load_addr64(b);
   struct mme_value64 avail_addr = mme_load_addr64(b);
   struct mme_value64 report_addr = mme_load_addr64(b);

   struct mme_value query_count = mme_load(b);
   struct mme_value control = mme_load(b);
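
   /* Decode the control dword packed by nvk_cmd_copy_query_pool_results_mme()
    * below: bits 7:0 hold the VkQueryResultFlags, bits 23:8 the query stride
    * in bytes, and bit 24 the is-timestamp flag.
    */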
   struct mme_value flags = control;
   struct mme_value write64 =
      mme_and(b, flags, mme_imm(VK_QUERY_RESULT_64_BIT));
   struct mme_value query_stride =
      mme_merge(b, mme_zero(), control, 0, 16, 8);
   struct mme_value is_timestamp =
      mme_merge(b, mme_zero(), control, 0, 1, 24);

   mme_while(b, ugt, query_count, mme_zero()) {
      struct mme_value dw_per_query = mme_srl(b, query_stride, mme_imm(2));
      mme_tu104_read_fifoed(b, report_addr, dw_per_query);
      mme_free_reg(b, dw_per_query);

      struct mme_value64 write_addr = mme_mov64(b, dst_addr);
      struct mme_value report_count = mme_srl(b, query_stride, mme_imm(4));
      mme_while(b, ugt, report_count, mme_zero()) {
         struct mme_value result_lo = mme_alloc_reg(b);
         struct mme_value result_hi = mme_alloc_reg(b);
         struct mme_value64 result = mme_value64(result_lo, result_hi);

         mme_if(b, ine, is_timestamp, mme_zero()) {
            mme_load_to(b, mme_zero());
            mme_load_to(b, mme_zero());
            mme_load_to(b, result_lo);
            mme_load_to(b, result_hi);
            mme_sub_to(b, report_count, report_count, mme_imm(1));
         }
         mme_if(b, ieq, is_timestamp, mme_zero()) {
            struct mme_value begin_lo = mme_load(b);
            struct mme_value begin_hi = mme_load(b);
            struct mme_value64 begin = mme_value64(begin_lo, begin_hi);
            mme_load_to(b, mme_zero());
            mme_load_to(b, mme_zero());

            struct mme_value end_lo = mme_load(b);
            struct mme_value end_hi = mme_load(b);
            struct mme_value64 end = mme_value64(end_lo, end_hi);
            mme_load_to(b, mme_zero());
            mme_load_to(b, mme_zero());

            mme_sub64_to(b, result, end, begin);
            mme_sub_to(b, report_count, report_count, mme_imm(2));

            mme_free_reg64(b, begin);
            mme_free_reg64(b, end);
         }

         mme_store_global(b, write_addr, result_lo);
         mme_add64_to(b, write_addr, write_addr, mme_imm64(4));
         mme_if(b, ine, write64, mme_zero()) {
            mme_store_global(b, write_addr, result_hi);
            mme_add64_to(b, write_addr, write_addr, mme_imm64(4));
         }
      }

      struct mme_value with_availability =
         mme_and(b, flags, mme_imm(VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
      mme_if(b, ine, with_availability, mme_zero()) {
         mme_tu104_read_fifoed(b, avail_addr, mme_imm(1));
         struct mme_value avail = mme_load(b);
         mme_store_global(b, write_addr, avail);
         mme_if(b, ine, write64, mme_zero()) {
            mme_add64_to(b, write_addr, write_addr, mme_imm64(4));
            mme_store_global(b, write_addr, mme_zero());
         }
      }
      mme_free_reg(b, with_availability);

      mme_add64_to(b, avail_addr, avail_addr, mme_imm64(4));

      mme_add64_to(b, report_addr, report_addr,
                   mme_value64(query_stride, mme_zero()));

      mme_add64_to(b, dst_addr, dst_addr, dst_stride);

      mme_sub_to(b, query_count, query_count, mme_imm(1));
   }
}

static void
nvk_cmd_copy_query_pool_results_mme(struct nvk_cmd_buffer *cmd,
                                    struct nvk_query_pool *pool,
                                    uint32_t first_query,
                                    uint32_t query_count,
                                    uint64_t dst_addr,
                                    uint64_t dst_stride,
                                    VkQueryResultFlags flags)
{
   /* TODO: vkCmdCopyQueryPoolResults() with a compute shader */
   assert(nvk_cmd_buffer_device(cmd)->pdev->info.cls_eng3d >= TURING_A);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 13);
   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
   P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_COPY_QUERIES));

   P_INLINE_DATA(p, dst_addr >> 32);
   P_INLINE_DATA(p, dst_addr);
   P_INLINE_DATA(p, dst_stride >> 32);
   P_INLINE_DATA(p, dst_stride);

   uint64_t avail_start = nvk_query_available_addr(pool, first_query);
   P_INLINE_DATA(p, avail_start >> 32);
   P_INLINE_DATA(p, avail_start);

   uint64_t report_start = nvk_query_report_addr(pool, first_query);
   P_INLINE_DATA(p, report_start >> 32);
   P_INLINE_DATA(p, report_start);

   P_INLINE_DATA(p, query_count);

   uint32_t is_timestamp = pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP;

   uint32_t control = (flags & 0xff) |
                      (pool->query_stride << 8) |
                      (is_timestamp << 24);
   P_INLINE_DATA(p, control);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                            VkQueryPool queryPool,
                            uint32_t firstQuery,
                            uint32_t queryCount,
                            VkBuffer dstBuffer,
                            VkDeviceSize dstOffset,
                            VkDeviceSize stride,
                            VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer);

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      for (uint32_t i = 0; i < queryCount; i++) {
         uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i);

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
         __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
         P_NV906F_SEMAPHOREA(p, avail_addr >> 32);
         P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2);
         P_NV906F_SEMAPHOREC(p, 1);
         P_NV906F_SEMAPHORED(p, {
            .operation = OPERATION_ACQ_GEQ,
            .acquire_switch = ACQUIRE_SWITCH_ENABLED,
            .release_size = RELEASE_SIZE_4BYTE,
         });
      }
   }

   uint64_t dst_addr = nvk_buffer_address(dst_buffer, dstOffset);
   nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount,
                                    dst_addr, stride, flags);
}