1 /*
2  * Copyright 2016 Red Hat Inc.
3  * SPDX-License-Identifier: MIT
4  *
5  * Based on anv:
6  * Copyright © 2015 Intel Corporation
7  */
8 
9 #include "tu_query_pool.h"
10 
11 #include <fcntl.h>
12 
13 #include "nir/nir_builder.h"
14 #include "util/os_time.h"
15 
16 #include "vk_acceleration_structure.h"
17 #include "vk_util.h"
18 
19 #include "tu_buffer.h"
20 #include "bvh/tu_build_interface.h"
21 #include "tu_cmd_buffer.h"
22 #include "tu_cs.h"
23 #include "tu_device.h"
24 #include "tu_rmv.h"
25 
26 #include "common/freedreno_gpu_event.h"
27 
28 #define NSEC_PER_SEC 1000000000ull
29 #define WAIT_TIMEOUT 5
30 #define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
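/* Note: each RBBM_PRIMCTR_n counter is a LO/HI register pair, so the
 * register-offset difference above is divided by 2 and incremented to get the
 * number of 64-bit counters; STAT_COUNT works out to 11, matching the indices
 * returned by statistics_index() below.
 */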
31 
32 struct PACKED query_slot {
33    uint64_t available;
34 };
35 
36 struct PACKED occlusion_query_slot {
37    struct query_slot common;
38    uint64_t _padding0;
39 
40    uint64_t begin;
41    uint64_t result;
42    uint64_t end;
43    uint64_t _padding1;
44 };
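/* With the layout above, begin lands at offset 16 and end at offset 32, so
 * both are 16-byte aligned and end sits exactly 16 bytes after begin; the
 * a7xx ZPASS_DONE path below (write_sample_count + sample_count_end_offset)
 * presumably relies on this fixed begin/end spacing.
 */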
45 
46 struct PACKED timestamp_query_slot {
47    struct query_slot common;
48    uint64_t result;
49 };
50 
51 struct PACKED primitive_slot_value {
52    uint64_t values[2];
53 };
54 
55 struct PACKED pipeline_stat_query_slot {
56    struct query_slot common;
57    uint64_t results[STAT_COUNT];
58 
59    uint64_t begin[STAT_COUNT];
60    uint64_t end[STAT_COUNT];
61 };
62 
63 struct PACKED primitive_query_slot {
64    struct query_slot common;
65    /* The result of transform feedback queries is two integer values:
66     *   results[0] is the count of primitives written,
67     *   results[1] is the count of primitives generated.
68     * Per-stream begin/end counter values are also stored in the 4 array slots below.
69     */
70    uint64_t results[2];
71 
72    /* Primitive counters also need to be 16-byte aligned. */
73    uint64_t _padding;
74 
75    struct primitive_slot_value begin[4];
76    struct primitive_slot_value end[4];
77 };
78 
79 struct PACKED perfcntr_query_slot {
80    uint64_t result;
81    uint64_t begin;
82    uint64_t end;
83 };
84 
85 struct PACKED perf_query_slot {
86    struct query_slot common;
87    struct perfcntr_query_slot perfcntr;
88 };
89 
90 struct PACKED primitives_generated_query_slot {
91    struct query_slot common;
92    uint64_t result;
93    uint64_t begin;
94    uint64_t end;
95 };
96 
97 struct PACKED accel_struct_slot {
98    struct query_slot common;
99    uint64_t result;
100 };
101 
102 /* Returns the IOVA or mapped address of a given uint64_t field
103  * in a given slot of a query pool. */
104 #define query_iova(type, pool, query, field)                               \
105    pool->bo->iova + pool->query_stride * (query) + offsetof(type, field)
106 #define query_addr(type, pool, query, field)                               \
107    (uint64_t *) ((char *) pool->bo->map + pool->query_stride * (query) +   \
108                  offsetof(type, field))
109 
110 #define occlusion_query_iova(pool, query, field)                           \
111    query_iova(struct occlusion_query_slot, pool, query, field)
112 #define occlusion_query_addr(pool, query, field)                           \
113    query_addr(struct occlusion_query_slot, pool, query, field)
114 
115 #define pipeline_stat_query_iova(pool, query, field, idx)                  \
116    pool->bo->iova + pool->query_stride * (query) +                         \
117       offsetof_arr(struct pipeline_stat_query_slot, field, (idx))
118 
119 #define primitive_query_iova(pool, query, field, stream_id, i)             \
120    query_iova(struct primitive_query_slot, pool, query, field) +           \
121       sizeof_field(struct primitive_query_slot, field[0]) * (stream_id) +  \
122       offsetof_arr(struct primitive_slot_value, values, (i))
123 
124 #define perf_query_iova(pool, query, field, i)                             \
125    pool->bo->iova + pool->query_stride * (query) +                         \
126    sizeof(struct query_slot) +                                             \
127    sizeof(struct perfcntr_query_slot) * (i) +                              \
128    offsetof(struct perfcntr_query_slot, field)
129 
130 #define primitives_generated_query_iova(pool, query, field)                \
131    query_iova(struct primitives_generated_query_slot, pool, query, field)
132 
133 #define query_available_iova(pool, query)                                  \
134    query_iova(struct query_slot, pool, query, available)
135 
136 #define query_result_iova(pool, query, type, i)                            \
137    pool->bo->iova + pool->query_stride * (query) +                         \
138    sizeof(struct query_slot) + sizeof(type) * (i)
139 
140 #define query_result_addr(pool, query, type, i)                            \
141    (uint64_t *) ((char *) pool->bo->map + pool->query_stride * (query) +   \
142                  sizeof(struct query_slot) + sizeof(type) * (i))
143 
144 #define query_is_available(slot) slot->available
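/* Illustrative expansion (assuming a timestamp pool, so query_stride ==
 * sizeof(struct timestamp_query_slot)):
 *
 *    query_result_iova(pool, q, uint64_t, 0)
 *       == pool->bo->iova + q * pool->query_stride + sizeof(struct query_slot)
 *
 * i.e. the GPU address of timestamp_query_slot::result for slot q, since the
 * result immediately follows the common availability word.
 */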
145 
146 static const VkPerformanceCounterUnitKHR
147 fd_perfcntr_type_to_vk_unit[] = {
148    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
149    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
150    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
151    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
152    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
153    /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert the value. */
154    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
155    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
156    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
157    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
158    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
159    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
160    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
161 };
162 
163 /* TODO: This mapping comes from the freedreno implementation, where only
164  * UINT64 is used. It should be confirmed against the blob Vulkan driver
165  * once it supports performance queries.
166  */
167 static const VkPerformanceCounterStorageKHR
168 fd_perfcntr_type_to_vk_storage[] = {
169    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
170    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
171    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
172    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
173    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
174    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
175    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
176    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
177    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
178    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
179    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
180    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
181 };
182 
183 /*
184  * Returns a pointer to a given slot in a query pool.
185  */
186 static struct query_slot *
187 slot_address(struct tu_query_pool *pool, uint32_t query)
188 {
189    return (struct query_slot *) ((char *) pool->bo->map +
190                                  query * pool->query_stride);
191 }
192 
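/* Illustrative example: with two groups exposing 10 and 20 countables, a flat
 * index of 13 walks past the first group and resolves to gid == 1, cid == 3;
 * the assert fires if the index exceeds the total countable count.
 */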
193 static void
194 perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
195                uint32_t index, uint32_t *gid, uint32_t *cid)
196 
197 {
198    uint32_t i;
199 
200    for (i = 0; i < group_count; i++) {
201       if (group[i].num_countables > index) {
202          *gid = i;
203          *cid = index;
204          break;
205       }
206       index -= group[i].num_countables;
207    }
208 
209    assert(i < group_count);
210 }
211 
212 static int
213 compare_perfcntr_pass(const void *a, const void *b)
214 {
215    return ((struct tu_perf_query_data *)a)->pass -
216           ((struct tu_perf_query_data *)b)->pass;
217 }
218 
219 VKAPI_ATTR VkResult VKAPI_CALL
220 tu_CreateQueryPool(VkDevice _device,
221                    const VkQueryPoolCreateInfo *pCreateInfo,
222                    const VkAllocationCallbacks *pAllocator,
223                    VkQueryPool *pQueryPool)
224 {
225    VK_FROM_HANDLE(tu_device, device, _device);
226    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
227    assert(pCreateInfo->queryCount > 0);
228 
229    uint32_t pool_size, slot_size;
230    const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
231 
232    pool_size = sizeof(struct tu_query_pool);
233 
234    switch (pCreateInfo->queryType) {
235    case VK_QUERY_TYPE_OCCLUSION:
236       slot_size = sizeof(struct occlusion_query_slot);
237       break;
238    case VK_QUERY_TYPE_TIMESTAMP:
239       slot_size = sizeof(struct timestamp_query_slot);
240       break;
241    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
242       slot_size = sizeof(struct primitive_query_slot);
243       break;
244    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
245       slot_size = sizeof(struct primitives_generated_query_slot);
246       break;
247    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
248       perf_query_info =
249             vk_find_struct_const(pCreateInfo->pNext,
250                                  QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
251       assert(perf_query_info);
252 
253       slot_size = sizeof(struct perf_query_slot) +
254                   sizeof(struct perfcntr_query_slot) *
255                   (perf_query_info->counterIndexCount - 1);
256 
257       /* Size of the array pool->tu_perf_query_data */
258       pool_size += sizeof(struct tu_perf_query_data) *
259                    perf_query_info->counterIndexCount;
260       break;
261    }
262    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
263    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
264    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
265    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
266       slot_size = sizeof(struct accel_struct_slot);
267       break;
268    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
269       slot_size = sizeof(struct pipeline_stat_query_slot);
270       break;
271    default:
272       unreachable("Invalid query type");
273    }
274 
275    struct tu_query_pool *pool = (struct tu_query_pool *)
276          vk_query_pool_create(&device->vk, pCreateInfo,
277                               pAllocator, pool_size);
278    if (!pool)
279       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
280 
281    if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
282       pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
283                                       &pool->perf_group_count);
284 
285       pool->counter_index_count = perf_query_info->counterIndexCount;
286 
287       /* Build data for all requested perf counters, so we can derive the
288        * correct group id, countable id, counter register and pass index from
289        * only the counter index provided by applications at each command submit.
290        *
291        * Also, since this data will be sorted by pass index later, we keep the
292        * original indices and store perfcntr results according to them, so
293        * apps get correct results for their own indices.
294        */
295       uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
296       memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
297       memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
298 
299       for (uint32_t i = 0; i < pool->counter_index_count; i++) {
300          uint32_t gid = 0, cid = 0;
301 
302          perfcntr_index(pool->perf_group, pool->perf_group_count,
303                         perf_query_info->pCounterIndices[i], &gid, &cid);
304 
305          pool->perf_query_data[i].gid = gid;
306          pool->perf_query_data[i].cid = cid;
307          pool->perf_query_data[i].app_idx = i;
308 
309          /* When a counter register index exceeds the capacity
310           * (num_counters) of its group, reset it for the next pass.
311           */
312          if (regs[gid] < pool->perf_group[gid].num_counters) {
313             pool->perf_query_data[i].cntr_reg = regs[gid]++;
314             pool->perf_query_data[i].pass = pass[gid];
315          } else {
316             pool->perf_query_data[i].pass = ++pass[gid];
317             pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
318             regs[gid]++;
319          }
320       }
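      /* Illustrative example: if a group exposes num_counters == 4 and an
       * application requests 6 countables from it, the first four get
       * cntr_reg 0..3 in pass 0 and the remaining two get cntr_reg 0..1 in
       * pass 1, so the query needs two submitted passes.
       */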
321 
322       /* Sort by pass index so we can easily prepare command streams
323        * in ascending order of pass index.
324        */
325       qsort(pool->perf_query_data, pool->counter_index_count,
326             sizeof(pool->perf_query_data[0]),
327             compare_perfcntr_pass);
328    }
329 
330    VkResult result = tu_bo_init_new(device, &pool->vk.base, &pool->bo,
331          pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
332    if (result != VK_SUCCESS) {
333       vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
334       return result;
335    }
336 
337    result = tu_bo_map(device, pool->bo, NULL);
338    if (result != VK_SUCCESS) {
339       tu_bo_finish(device, pool->bo);
340       vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
341       return result;
342    }
343 
344    /* Initialize all query statuses to unavailable */
345    memset(pool->bo->map, 0, pool->bo->size);
346 
347    pool->size = pCreateInfo->queryCount;
348    pool->query_stride = slot_size;
349 
350    TU_RMV(query_pool_create, device, pool);
351 
352    *pQueryPool = tu_query_pool_to_handle(pool);
353 
354    return VK_SUCCESS;
355 }
356 
357 VKAPI_ATTR void VKAPI_CALL
358 tu_DestroyQueryPool(VkDevice _device,
359                     VkQueryPool _pool,
360                     const VkAllocationCallbacks *pAllocator)
361 {
362    VK_FROM_HANDLE(tu_device, device, _device);
363    VK_FROM_HANDLE(tu_query_pool, pool, _pool);
364 
365    if (!pool)
366       return;
367 
368    TU_RMV(resource_destroy, device, pool);
369 
370    tu_bo_finish(device, pool->bo);
371    vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
372 }
373 
374 static uint32_t
375 get_result_count(struct tu_query_pool *pool)
376 {
377    switch (pool->vk.query_type) {
378    /* Occlusion and timestamp queries write one integer value */
379    case VK_QUERY_TYPE_OCCLUSION:
380    case VK_QUERY_TYPE_TIMESTAMP:
381    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
382    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
383    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
384    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
385    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
386       return 1;
387    /* Transform feedback queries write two integer values */
388    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
389       return 2;
390    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
391       return util_bitcount(pool->vk.pipeline_statistics);
392    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
393       return pool->counter_index_count;
394    default:
395       assert(!"Invalid query type");
396       return 0;
397    }
398 }
399 
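/* Maps a single Vulkan pipeline-statistic bit (consumed from *statistics) to
 * the hardware counter index whose begin/end snapshot holds that value; the
 * ordering below follows the RBBM_PRIMCTR counter layout rather than the
 * Vulkan bit order, which is why the mapping is not monotonic.
 */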
400 static uint32_t
401 statistics_index(uint32_t *statistics)
402 {
403    uint32_t stat;
404    stat = u_bit_scan(statistics);
405 
406    switch (1 << stat) {
407    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
408       return 0;
409    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
410       return 1;
411    case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
412       return 2;
413    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
414       return 5;
415    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
416       return 6;
417    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
418       return 7;
419    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
420       return 8;
421    case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
422       return 9;
423    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
424       return 3;
425    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
426       return 4;
427    case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
428       return 10;
429    default:
430       return 0;
431    }
432 }
433 
434 static bool
435 is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
436 {
437    return pipeline_statistics &
438           (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
439            VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
440            VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
441            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
442            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
443            VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
444            VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
445            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
446            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
447 }
448 
449 static bool
450 is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
451 {
452    return pipeline_statistics &
453           VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
454 }
455 
456 static bool
457 is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
458 {
459    return pipeline_statistics &
460           VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
461 }
462 
463 /* Wait on the availability status of a query up until a timeout. */
464 static VkResult
465 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
466                    uint32_t query)
467 {
468    /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
469     * scheduler friendly way instead of busy polling once the patch has landed
470     * upstream. */
471    struct query_slot *slot = slot_address(pool, query);
472    uint64_t abs_timeout = os_time_get_absolute_timeout(
473          WAIT_TIMEOUT * NSEC_PER_SEC);
474    while(os_time_get_nano() < abs_timeout) {
475       if (query_is_available(slot))
476          return VK_SUCCESS;
477    }
478    return vk_error(device, VK_TIMEOUT);
479 }
480 
481 /* Writes a query value to a buffer from the CPU. */
482 static void
483 write_query_value_cpu(char* base,
484                       uint32_t offset,
485                       uint64_t value,
486                       VkQueryResultFlags flags)
487 {
488    if (flags & VK_QUERY_RESULT_64_BIT) {
489       *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
490    } else {
491       *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
492    }
493 }
494 
495 static VkResult
496 get_query_pool_results(struct tu_device *device,
497                        struct tu_query_pool *pool,
498                        uint32_t firstQuery,
499                        uint32_t queryCount,
500                        size_t dataSize,
501                        void *pData,
502                        VkDeviceSize stride,
503                        VkQueryResultFlags flags)
504 {
505    assert(dataSize >= stride * queryCount);
506 
507    char *result_base = (char *) pData;
508    VkResult result = VK_SUCCESS;
509    for (uint32_t i = 0; i < queryCount; i++) {
510       uint32_t query = firstQuery + i;
511       struct query_slot *slot = slot_address(pool, query);
512       bool available = query_is_available(slot);
513       uint32_t result_count = get_result_count(pool);
514       uint32_t statistics = pool->vk.pipeline_statistics;
515 
516       if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
517          VkResult wait_result = wait_for_available(device, pool, query);
518          if (wait_result != VK_SUCCESS)
519             return wait_result;
520          available = true;
521       } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
522          /* From the Vulkan 1.1.130 spec:
523           *
524           *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
525           *    both not set then no result values are written to pData for
526           *    queries that are in the unavailable state at the time of the
527           *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
528           *    availability state is still written to pData for those queries
529           *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
530           */
531          result = VK_NOT_READY;
532          if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
533             result_base += stride;
534             continue;
535          }
536       }
537 
538       for (uint32_t k = 0; k < result_count; k++) {
539          if (available) {
540             uint64_t *result;
541 
542             if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
543                uint32_t stat_idx = statistics_index(&statistics);
544                result = query_result_addr(pool, query, uint64_t, stat_idx);
545             } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
546                result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
547             } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
548                assert(k == 0);
549                result = occlusion_query_addr(pool, query, result);
550             } else {
551                result = query_result_addr(pool, query, uint64_t, k);
552             }
553 
554             write_query_value_cpu(result_base, k, *result, flags);
555          } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
556              /* From the Vulkan 1.1.130 spec:
557               *
558               *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
559               *   is not set, and the query’s status is unavailable, an
560               *   intermediate result value between zero and the final result
561               *   value is written to pData for that query.
562               *
563               * Just return 0 here for simplicity since it's a valid result.
564               */
565             write_query_value_cpu(result_base, k, 0, flags);
566       }
567 
568       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
569          /* From the Vulkan 1.1.130 spec:
570           *
571           *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
572           *    integer value written for each query is non-zero if the query’s
573           *    status was available or zero if the status was unavailable.
574           */
575          write_query_value_cpu(result_base, result_count, available, flags);
576 
577       result_base += stride;
578    }
579    return result;
580 }
581 
582 VKAPI_ATTR VkResult VKAPI_CALL
583 tu_GetQueryPoolResults(VkDevice _device,
584                        VkQueryPool queryPool,
585                        uint32_t firstQuery,
586                        uint32_t queryCount,
587                        size_t dataSize,
588                        void *pData,
589                        VkDeviceSize stride,
590                        VkQueryResultFlags flags)
591 {
592    VK_FROM_HANDLE(tu_device, device, _device);
593    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
594    assert(firstQuery + queryCount <= pool->size);
595 
596    if (vk_device_is_lost(&device->vk))
597       return VK_ERROR_DEVICE_LOST;
598 
599    switch (pool->vk.query_type) {
600    case VK_QUERY_TYPE_OCCLUSION:
601    case VK_QUERY_TYPE_TIMESTAMP:
602    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
603    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
604    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
605    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
606    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
607    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
608    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
609    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
610       return get_query_pool_results(device, pool, firstQuery, queryCount,
611                                     dataSize, pData, stride, flags);
612    default:
613       assert(!"Invalid query type");
614    }
615    return VK_SUCCESS;
616 }
617 
618 /* Copies a query value from one buffer to another from the GPU. */
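/* CP_MEM_TO_MEM copies one 32-bit dword by default and a 64-bit qword when
 * the DOUBLE bit is set, which is how VK_QUERY_RESULT_64_BIT is honored on
 * the GPU copy path.
 */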
619 static void
620 copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
621                      struct tu_cs *cs,
622                      uint64_t src_iova,
623                      uint64_t base_write_iova,
624                      uint32_t offset,
625                      VkQueryResultFlags flags) {
626    uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
627          sizeof(uint64_t) : sizeof(uint32_t);
628    uint64_t write_iova = base_write_iova + (offset * element_size);
629 
630    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
631    uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
632          CP_MEM_TO_MEM_0_DOUBLE : 0;
633    tu_cs_emit(cs, mem_to_mem_flags);
634    tu_cs_emit_qw(cs, write_iova);
635    tu_cs_emit_qw(cs, src_iova);
636 }
637 
638 template <chip CHIP>
639 static void
640 emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
641                              struct tu_cs *cs,
642                              struct tu_query_pool *pool,
643                              uint32_t firstQuery,
644                              uint32_t queryCount,
645                              struct tu_buffer *buffer,
646                              VkDeviceSize dstOffset,
647                              VkDeviceSize stride,
648                              VkQueryResultFlags flags)
649 {
650    /* Flush cache for the buffer to copy to. */
651    tu_emit_cache_flush<CHIP>(cmdbuf);
652 
653    /* From the Vulkan 1.1.130 spec:
654     *
655     *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
656     *    uses of vkCmdResetQueryPool in the same queue, without any additional
657     *    synchronization.
658     *
659     * To ensure that previous writes to the available bit are coherent, first
660     * wait for all writes to complete.
661     */
662    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
663 
664    for (uint32_t i = 0; i < queryCount; i++) {
665       uint32_t query = firstQuery + i;
666       uint64_t available_iova = query_available_iova(pool, query);
667       uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
668       uint32_t result_count = get_result_count(pool);
669       uint32_t statistics = pool->vk.pipeline_statistics;
670 
671       /* Wait for the available bit to be set if executed with the
672        * VK_QUERY_RESULT_WAIT_BIT flag. */
673       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
674          tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
675          tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
676                         CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
677          tu_cs_emit_qw(cs, available_iova);
678          tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
679          tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
680          tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
681       }
682 
683       for (uint32_t k = 0; k < result_count; k++) {
684          uint64_t result_iova;
685 
686          if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
687             uint32_t stat_idx = statistics_index(&statistics);
688             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
689          } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
690             result_iova = query_result_iova(pool, query,
691                                             struct perfcntr_query_slot, k);
692          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
693             assert(k == 0);
694             result_iova = occlusion_query_iova(pool, query, result);
695          } else {
696             result_iova = query_result_iova(pool, query, uint64_t, k);
697          }
698 
699          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
700             /* Unconditionally copying the bo->result into the buffer here is
701              * valid because we only set bo->result on vkCmdEndQuery. Thus, even
702              * if the query is unavailable, this will copy the correct partial
703              * value of 0.
704              */
705             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
706                                  k /* offset */, flags);
707          } else {
708             /* Conditionally copy bo->result into the buffer based on whether the
709              * query is available.
710              *
711              * NOTE: For the conditional packets to be executed, CP_COND_EXEC
712              * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
713              * that 0 < available < 2, aka available == 1.
714              */
715             tu_cs_reserve(cs, 7 + 6);
716             tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
717             tu_cs_emit_qw(cs, available_iova);
718             tu_cs_emit_qw(cs, available_iova);
719             tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
720             tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
721 
722             /* Start of conditional execution */
723             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
724                               k /* offset */, flags);
725             /* End of conditional execution */
726          }
727       }
728 
729       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
730          copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
731                               result_count /* offset */, flags);
732       }
733    }
734 }
735 
736 template <chip CHIP>
737 VKAPI_ATTR void VKAPI_CALL
738 tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
739                            VkQueryPool queryPool,
740                            uint32_t firstQuery,
741                            uint32_t queryCount,
742                            VkBuffer dstBuffer,
743                            VkDeviceSize dstOffset,
744                            VkDeviceSize stride,
745                            VkQueryResultFlags flags)
746 {
747    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
748    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
749    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
750    struct tu_cs *cs = &cmdbuf->cs;
751    assert(firstQuery + queryCount <= pool->size);
752 
753    switch (pool->vk.query_type) {
754    case VK_QUERY_TYPE_OCCLUSION:
755    case VK_QUERY_TYPE_TIMESTAMP:
756    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
757    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
758    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
759    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
760    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
761    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
762    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
763       return emit_copy_query_pool_results<CHIP>(cmdbuf, cs, pool, firstQuery,
764                                                 queryCount, buffer, dstOffset,
765                                                 stride, flags);
766    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
767       unreachable("allowCommandBufferQueryCopies is false");
768    default:
769       assert(!"Invalid query type");
770    }
771 }
772 TU_GENX(tu_CmdCopyQueryPoolResults);
773 
774 static void
775 emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
776                       struct tu_query_pool *pool,
777                       uint32_t firstQuery,
778                       uint32_t queryCount)
779 {
780    struct tu_cs *cs = &cmdbuf->cs;
781 
782    for (uint32_t i = 0; i < queryCount; i++) {
783       uint32_t query = firstQuery + i;
784       uint32_t statistics = pool->vk.pipeline_statistics;
785 
786       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
787       tu_cs_emit_qw(cs, query_available_iova(pool, query));
788       tu_cs_emit_qw(cs, 0x0);
789 
790       for (uint32_t k = 0; k < get_result_count(pool); k++) {
791          uint64_t result_iova;
792 
793          if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
794             uint32_t stat_idx = statistics_index(&statistics);
795             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
796          } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
797             result_iova = query_result_iova(pool, query,
798                                             struct perfcntr_query_slot, k);
799          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
800             assert(k == 0);
801             result_iova = occlusion_query_iova(pool, query, result);
802          } else {
803             result_iova = query_result_iova(pool, query, uint64_t, k);
804          }
805 
806          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
807          tu_cs_emit_qw(cs, result_iova);
808          tu_cs_emit_qw(cs, 0x0);
809       }
810    }
811 
812 }
813 
814 VKAPI_ATTR void VKAPI_CALL
815 tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
816                      VkQueryPool queryPool,
817                      uint32_t firstQuery,
818                      uint32_t queryCount)
819 {
820    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
821    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
822 
823    switch (pool->vk.query_type) {
824    case VK_QUERY_TYPE_TIMESTAMP:
825    case VK_QUERY_TYPE_OCCLUSION:
826    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
827    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
828    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
829    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
830    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
831    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
832    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
833    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
834       emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
835       break;
836    default:
837       assert(!"Invalid query type");
838    }
839 }
840 
841 VKAPI_ATTR void VKAPI_CALL
842 tu_ResetQueryPool(VkDevice device,
843                   VkQueryPool queryPool,
844                   uint32_t firstQuery,
845                   uint32_t queryCount)
846 {
847    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
848 
849    for (uint32_t i = 0; i < queryCount; i++) {
850       struct query_slot *slot = slot_address(pool, i + firstQuery);
851       slot->available = 0;
852 
853       for (uint32_t k = 0; k < get_result_count(pool); k++) {
854          uint64_t *res;
855 
856          if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
857             res = query_result_addr(pool, i + firstQuery,
858                                     struct perfcntr_query_slot, k);
859          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
860             assert(k == 0);
861             res = occlusion_query_addr(pool, i + firstQuery, result);
862          } else {
863             res = query_result_addr(pool, i + firstQuery, uint64_t, k);
864          }
865 
866          *res = 0;
867       }
868    }
869 }
870 
871 template <chip CHIP>
872 static void
873 emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
874                            struct tu_query_pool *pool,
875                            uint32_t query)
876 {
877    /* From the Vulkan 1.1.130 spec:
878     *
879     *    A query must begin and end inside the same subpass of a render pass
880     *    instance, or must both begin and end outside of a render pass
881     *    instance.
882     *
883     * Unlike on an immediate-mode renderer, Turnip renders all tiles on
884     * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
885     * query begins/ends inside the same subpass of a render pass, we need to
886     * record the packets on the secondary draw command stream. cmdbuf->draw_cs
887     * is then run on every tile during render, so we just need to accumulate
888     * sample counts in slot->result to compute the query result.
889     */
890    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
891 
892    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
893 
894    tu_cs_emit_regs(cs,
895                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
896 
897    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
898       tu_cs_emit_regs(cs,
899                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
900       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
901       tu_cs_emit(cs, ZPASS_DONE);
902       if (CHIP == A7XX) {
903          /* Copied from blob's cmdstream, not sure why it is done. */
904          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
905          tu_cs_emit(cs, CCU_CLEAN_DEPTH);
906       }
907    } else {
908       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
909       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
910                                        .write_sample_count = true).value);
911       tu_cs_emit_qw(cs, begin_iova);
912 
913       /* ZPASS_DONE events should come in begin-end pairs. When emitting an
914        * occlusion query outside of a renderpass, we emit a fake end event that
915        * closes the previous one, since the autotuner's ZPASS_DONE use could end
916        * up causing problems. This event writes into the end field of the query
917        * slot, but it will be overwritten by events in emit_end_occlusion_query
918        * with the proper value.
919        * When inside a renderpass, the corresponding ZPASS_DONE event will be
920        * emitted in emit_end_occlusion_query. We note the use of ZPASS_DONE on
921        * the state object, enabling the autotuner to optimize its own events.
922        */
923       if (!cmdbuf->state.pass) {
924          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
925          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
926                                           .write_sample_count = true,
927                                           .sample_count_end_offset = true,
928                                           .write_accum_sample_count_diff = true).value);
929          tu_cs_emit_qw(cs, begin_iova);
930       } else {
931          cmdbuf->state.rp.has_zpass_done_sample_count_write_in_rp = true;
932       }
933    }
934 }
935 
936 template <chip CHIP>
937 static void
938 emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
939                       struct tu_query_pool *pool,
940                       uint32_t query)
941 {
942    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
943    uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin, 0);
944 
945    if (is_pipeline_query_with_vertex_stage(pool->vk.pipeline_statistics)) {
946       bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
947       cmdbuf->state.prim_counters_running++;
948 
949       /* Prevent starting primitive counters when it is supposed to be stopped
950        * for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
951        */
952       if (need_cond_exec) {
953          tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
954                         CP_COND_REG_EXEC_0_SYSMEM |
955                         CP_COND_REG_EXEC_0_BINNING);
956       }
957 
958       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
959 
960       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
961       tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
962       tu_cs_emit(cs, 0);
963 
964       if (need_cond_exec) {
965          tu_cond_exec_end(cs);
966       }
967    }
968 
969    if (is_pipeline_query_with_fragment_stage(pool->vk.pipeline_statistics)) {
970       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_FRAGMENT_CTRS);
971    }
972 
973    if (is_pipeline_query_with_compute_stage(pool->vk.pipeline_statistics)) {
974       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_COMPUTE_CTRS);
975    }
976 
977    tu_cs_emit_wfi(cs);
978 
979    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
980    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
981                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
982                   CP_REG_TO_MEM_0_64B);
983    tu_cs_emit_qw(cs, begin_iova);
984 }
985 
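/* Predicates the following commands on the pass index: the per-pass command
 * streams prepared at device creation presumably write a one-hot pass mask
 * into CP_SCRATCH_REG(PERF_CNTRS_REG), and CP_REG_TEST checks BIT(pass)
 * against it, so only the counter programming for the currently submitted
 * pass survives the CP_COND_REG_EXEC predicate.
 */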
986 static void
987 emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
988 {
989    tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
990    tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
991                         REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
992                   A6XX_CP_REG_TEST_0_BIT(pass) |
993                   A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);
994    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
995 }
996 
997 static void
998 emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
999                            struct tu_query_pool *pool,
1000                            uint32_t query)
1001 {
1002    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1003    uint32_t last_pass = ~0;
1004 
1005    if (cmdbuf->state.pass) {
1006       cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
1007    }
1008 
1009    /* Querying perf counters happens in these steps:
1010     *
1011     *  0) There's a scratch reg to set a pass index for perf counters query.
1012     *     Prepare cmd streams to set each pass index to the reg at device
1013     *     creation time. See tu_CreateDevice in tu_device.c
1014     *  1) Emit command streams to read all requested perf counters at all
1015     *     passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
1016     *     reads the scratch reg where pass index is set.
1017     *     See emit_perfcntrs_pass_start.
1018     *  2) Pick the right cs setting proper pass index to the reg and prepend
1019     *     it to the command buffer at each submit time.
1020     *     See tu_queue_build_msm_gem_submit_cmds in tu_knl_drm_msm.cc and
1021     *     tu_knl_drm_virtio.cc and kgsl_queue_submit in tu_knl_kgsl.cc
1022     *  3) If the pass index in the reg is true, then executes the command
1023     *     stream below CP_COND_REG_EXEC.
1024     */
1025 
1026    tu_cs_emit_wfi(cs);
1027 
1028    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1029       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1030 
1031       if (last_pass != data->pass) {
1032          last_pass = data->pass;
1033 
1034          if (data->pass != 0)
1035             tu_cond_exec_end(cs);
1036          emit_perfcntrs_pass_start(cs, data->pass);
1037       }
1038 
1039       const struct fd_perfcntr_counter *counter =
1040             &pool->perf_group[data->gid].counters[data->cntr_reg];
1041       const struct fd_perfcntr_countable *countable =
1042             &pool->perf_group[data->gid].countables[data->cid];
1043 
1044       tu_cs_emit_pkt4(cs, counter->select_reg, 1);
1045       tu_cs_emit(cs, countable->selector);
1046    }
1047    tu_cond_exec_end(cs);
1048 
1049    last_pass = ~0;
1050    tu_cs_emit_wfi(cs);
1051 
1052    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1053       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1054 
1055       if (last_pass != data->pass) {
1056          last_pass = data->pass;
1057 
1058          if (data->pass != 0)
1059             tu_cond_exec_end(cs);
1060          emit_perfcntrs_pass_start(cs, data->pass);
1061       }
1062 
1063       const struct fd_perfcntr_counter *counter =
1064             &pool->perf_group[data->gid].counters[data->cntr_reg];
1065 
1066       uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1067 
1068       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1069       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1070                      CP_REG_TO_MEM_0_64B);
1071       tu_cs_emit_qw(cs, begin_iova);
1072    }
1073    tu_cond_exec_end(cs);
1074 }
1075 
1076 template <chip CHIP>
1077 static void
1078 emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
1079                      struct tu_query_pool *pool,
1080                      uint32_t query,
1081                      uint32_t stream_id)
1082 {
1083    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1084    uint64_t begin_iova = primitive_query_iova(pool, query, begin, 0, 0);
1085 
1086    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
1087    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1088 }
1089 
1090 template <chip CHIP>
1091 static void
1092 emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1093                                 struct tu_query_pool *pool,
1094                                 uint32_t query)
1095 {
1096    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1097    uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1098 
1099    if (cmdbuf->state.pass) {
1100       cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
1101    } else {
1102       cmdbuf->state.prim_generated_query_running_before_rp = true;
1103    }
1104 
1105    cmdbuf->state.prim_counters_running++;
1106 
1107    if (cmdbuf->state.pass) {
1108       /* Primitives that passed all tests would otherwise be counted again in
1109        * each tile, even with HW binning beforehand. Do not permit that.
1110        */
1111       tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1112                            CP_COND_REG_EXEC_0_SYSMEM |
1113                            CP_COND_REG_EXEC_0_BINNING);
1114    }
1115 
1116    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
1117 
1118    tu_cs_emit_wfi(cs);
1119 
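   /* Snapshot one 64-bit primitive counter (a LO/HI pair, hence CNT(2)) into
    * slot->begin. */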
1120    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1121    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1122                   CP_REG_TO_MEM_0_CNT(2) |
1123                   CP_REG_TO_MEM_0_64B);
1124    tu_cs_emit_qw(cs, begin_iova);
1125 
1126    if (cmdbuf->state.pass) {
1127       tu_cond_exec_end(cs);
1128    }
1129 }
1130 
1131 template <chip CHIP>
1132 VKAPI_ATTR void VKAPI_CALL
1133 tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
1134                            VkQueryPool queryPool,
1135                            uint32_t query,
1136                            VkQueryControlFlags flags,
1137                            uint32_t index)
1138 {
1139    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1140    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1141    assert(query < pool->size);
1142 
1143    switch (pool->vk.query_type) {
1144    case VK_QUERY_TYPE_OCCLUSION:
1145       /* In freedreno, there is no implementation difference between
1146        * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
1147        * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
1148        */
1149       emit_begin_occlusion_query<CHIP>(cmdbuf, pool, query);
1150       break;
1151    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1152       emit_begin_xfb_query<CHIP>(cmdbuf, pool, query, index);
1153       break;
1154    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1155       emit_begin_prim_generated_query<CHIP>(cmdbuf, pool, query);
1156       break;
1157    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1158       emit_begin_perf_query(cmdbuf, pool, query);
1159       break;
1160    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1161       emit_begin_stat_query<CHIP>(cmdbuf, pool, query);
1162       break;
1163    case VK_QUERY_TYPE_TIMESTAMP:
1164       unreachable("Unimplemented query type");
1165    default:
1166       assert(!"Invalid query type");
1167    }
1168 }
1169 TU_GENX(tu_CmdBeginQueryIndexedEXT);
1170 
1171 template <chip CHIP>
1172 static void
1173 emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
1174                          struct tu_query_pool *pool,
1175                          uint32_t query)
1176 {
1177    /* Ending an occlusion query happens in a few steps:
1178     *    1) Set the slot->end to UINT64_MAX.
1179     *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1180     *       write the current sample count value into slot->end.
1181     *    3) Since (2) is asynchronous, wait until slot->end is not equal to
1182     *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1183     *    4) Accumulate the results of the query (slot->end - slot->begin) into
1184     *       slot->result.
1185     *    5) If vkCmdEndQuery is *not* called from within the scope of a render
1186     *       pass, set the slot's available bit since the query is now done.
1187     *    6) If vkCmdEndQuery *is* called from within the scope of a render
1188     *       pass, we cannot mark as available yet since the commands in
1189     *       draw_cs are not run until vkCmdEndRenderPass.
1190     */
1191    const struct tu_render_pass *pass = cmdbuf->state.pass;
1192    struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1193 
1194    struct tu_cs *epilogue_cs = &cmdbuf->cs;
1195    if (pass)
1196       /* Technically, queries should be tracked per-subpass, but here we track
1197        * at the render pass level to simplify the code a bit. This is safe
1198        * because the only commands that use the available bit are
1199        * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1200        * cannot be invoked from inside a render pass scope.
1201        */
1202       epilogue_cs = &cmdbuf->draw_epilogue_cs;
1203 
1204    uint64_t available_iova = query_available_iova(pool, query);
1205    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1206    uint64_t result_iova = occlusion_query_iova(pool, query, result);
1207    uint64_t end_iova = occlusion_query_iova(pool, query, end);
1208 
1209    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
1210       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1211       tu_cs_emit_qw(cs, end_iova);
1212       tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1213 
1214       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1215    }
1216 
1217    tu_cs_emit_regs(cs,
1218                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1219 
1220    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
1221       tu_cs_emit_regs(cs,
1222                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1223       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1224       tu_cs_emit(cs, ZPASS_DONE);
1225       if (CHIP == A7XX) {
1226          /* Copied from blob's cmdstream, not sure why it is done. */
1227          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1228          tu_cs_emit(cs, CCU_CLEAN_DEPTH);
1229       }
1230 
1231       tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1232       tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1233                      CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
1234       tu_cs_emit_qw(cs, end_iova);
1235       tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1236       tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1237       tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1238 
1239       /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1240       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1241       tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1242       tu_cs_emit_qw(cs, result_iova);
1243       tu_cs_emit_qw(cs, result_iova);
1244       tu_cs_emit_qw(cs, end_iova);
1245       tu_cs_emit_qw(cs, begin_iova);
1246 
1247       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1248    } else {
1249       /* When outside of a renderpass, potential autotuner activity can cause
1250        * interference between ZPASS_DONE event pairs. In that case, like at the
1251        * beginning of the occlusion query, a fake ZPASS_DONE event is emitted to
1252        * compose a begin-end event pair. The first event will write into the end
1253        * field, but that will be overwritten by the second ZPASS_DONE which will
1254        * also handle the diff accumulation.
1255        */
1256       if (!cmdbuf->state.pass) {
1257          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
1258          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
1259                                           .write_sample_count = true).value);
1260          tu_cs_emit_qw(cs, end_iova);
1261       }
1262 
1263       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
1264       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
1265                                        .write_sample_count = true,
1266                                        .sample_count_end_offset = true,
1267                                        .write_accum_sample_count_diff = true).value);
1268       tu_cs_emit_qw(cs, begin_iova);
1269 
1270       tu_cs_emit_wfi(cs);
1271 
1272       if (cmdbuf->device->physical_device->info->a7xx.has_generic_clear) {
1273          /* If the next renderpass uses the same depth attachment and clears it
1274           * with a generic clear, ZPASS_DONE may somehow read stale values, which
1275           * are apparently invalidated by CCU_INVALIDATE_DEPTH.
1276           * See dEQP-VK.fragment_operations.early_fragment.sample_count_early_fragment_tests_depth_*
1277           */
1278          tu_emit_event_write<CHIP>(cmdbuf, epilogue_cs,
1279                                    FD_CCU_INVALIDATE_DEPTH);
1280       }
1281    }
1282 
1283    tu_cs_emit_pkt7(epilogue_cs, CP_MEM_WRITE, 4);
1284    tu_cs_emit_qw(epilogue_cs, available_iova);
1285    tu_cs_emit_qw(epilogue_cs, 0x1);
1286 }
1287 
1288 /* PRIMITIVE_CTRS is used for two distinct queries:
1289  * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
1290  * - VK_QUERY_TYPE_PIPELINE_STATISTICS
1291  * If one is nested inside the other, STOP_PRIMITIVE_CTRS should be emitted
1292  * only for the outer query.
1293  *
1294  * Also, a pipeline statistics query could run outside of a renderpass and a
1295  * primitives generated query inside of a secondary cmd buffer - for such a
1296  * case we need to track the status of the pipeline stats query.
1297  */
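/* For example (an illustrative sequence, not taken from any particular app):
 * if a pipeline statistics query is begun, then a primitives generated query
 * is begun and ended, and finally the pipeline statistics query is ended,
 * prim_counters_running (bumped when each of these queries begins) goes
 * 1 -> 2 -> 1 -> 0, and FD_STOP_PRIMITIVE_CTRS is only emitted when it
 * reaches 0, i.e. for the outer (pipeline statistics) query.
 */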
1298 template <chip CHIP>
1299 static void
1300 emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
1301                          struct tu_cs *cs,
1302                          VkQueryType query_type)
1303 {
1304    bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
1305    cmdbuf->state.prim_counters_running--;
1306    if (cmdbuf->state.prim_counters_running == 0) {
1307       bool need_cond_exec =
1308          is_secondary &&
1309          query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
1310          is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
1311 
1312       if (!need_cond_exec) {
1313          tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1314       } else {
1315          tu_cs_reserve(cs, 7 + 2);
1316          /* Check that the pipeline stats query is not running; only then
1317           * do we stop the counter.
1318           */
1319          tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
1320          tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1321          tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1322          tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
1323          tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
1324 
1325          tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1326       }
1327    }
1328 
1329    if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1330       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1331       tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1332       tu_cs_emit(cs, 1);
1333    }
1334 }
1335 
1336 template <chip CHIP>
1337 static void
1338 emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1339                     struct tu_query_pool *pool,
1340                     uint32_t query)
1341 {
1342    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1343    uint64_t end_iova = pipeline_stat_query_iova(pool, query, end, 0);
1344    uint64_t available_iova = query_available_iova(pool, query);
1345    uint64_t result_iova;
1346    uint64_t stat_start_iova;
1347    uint64_t stat_stop_iova;
1348 
1349    if (is_pipeline_query_with_vertex_stage(pool->vk.pipeline_statistics)) {
1350       /* No need to conditionally execute STOP_PRIMITIVE_CTRS when
1351        * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a
1352        * renderpass, because it is already stopped.
1353        */
1354       emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
1355    }
1356 
1357    if (is_pipeline_query_with_fragment_stage(pool->vk.pipeline_statistics)) {
1358       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_FRAGMENT_CTRS);
1359    }
1360 
1361    if (is_pipeline_query_with_compute_stage(pool->vk.pipeline_statistics)) {
1362       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_COMPUTE_CTRS);
1363    }
1364 
1365    tu_cs_emit_wfi(cs);
1366 
1367    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1368    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1369                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1370                   CP_REG_TO_MEM_0_64B);
1371    tu_cs_emit_qw(cs, end_iova);
1372 
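   /* The CP_REG_TO_MEM above snapshots all STAT_COUNT RBBM_PRIMCTR counters
    * (each a 64-bit LO/HI register pair) into the end[] array in one go; the
    * loop below then folds end[i] - begin[i] into results[i] for each
    * pipeline statistic, mirroring the occlusion-query accumulation.
    */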
1373    for (int i = 0; i < STAT_COUNT; i++) {
1374       result_iova = query_result_iova(pool, query, uint64_t, i);
1375       stat_start_iova = pipeline_stat_query_iova(pool, query, begin, i);
1376       stat_stop_iova = pipeline_stat_query_iova(pool, query, end, i);
1377 
1378       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1379       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1380                      CP_MEM_TO_MEM_0_DOUBLE |
1381                      CP_MEM_TO_MEM_0_NEG_C);
1382 
1383       tu_cs_emit_qw(cs, result_iova);
1384       tu_cs_emit_qw(cs, result_iova);
1385       tu_cs_emit_qw(cs, stat_stop_iova);
1386       tu_cs_emit_qw(cs, stat_start_iova);
1387    }
1388 
1389    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1390 
1391    if (cmdbuf->state.pass)
1392       cs = &cmdbuf->draw_epilogue_cs;
1393 
1394    /* Set the availability to 1 */
1395    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1396    tu_cs_emit_qw(cs, available_iova);
1397    tu_cs_emit_qw(cs, 0x1);
1398 }
1399 
1400 static void
1401 emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1402                          struct tu_query_pool *pool,
1403                          uint32_t query)
1404 {
1405    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1406    uint64_t available_iova = query_available_iova(pool, query);
1407    uint64_t end_iova;
1408    uint64_t begin_iova;
1409    uint64_t result_iova;
1410    uint32_t last_pass = ~0;
1411 
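   /* A note on the pass loops below: when the requested counters do not all
    * fit in a single pass (see
    * tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR further down),
    * the application is expected to submit once per pass with a different
    * counterPassIndex in VkPerformanceQuerySubmitInfoKHR, and the
    * conditional-execution region opened by emit_perfcntrs_pass_start() and
    * closed by tu_cond_exec_end() presumably limits each group of counter
    * reads to its own pass.
    */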
1412    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1413       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1414 
1415       if (last_pass != data->pass) {
1416          last_pass = data->pass;
1417 
1418          if (data->pass != 0)
1419             tu_cond_exec_end(cs);
1420          emit_perfcntrs_pass_start(cs, data->pass);
1421       }
1422 
1423       const struct fd_perfcntr_counter *counter =
1424             &pool->perf_group[data->gid].counters[data->cntr_reg];
1425 
1426       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1427 
1428       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1429       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1430                      CP_REG_TO_MEM_0_64B);
1431       tu_cs_emit_qw(cs, end_iova);
1432    }
1433    tu_cond_exec_end(cs);
1434 
1435    last_pass = ~0;
1436    tu_cs_emit_wfi(cs);
1437 
1438    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1439       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1440 
1441       if (last_pass != data->pass) {
1442          last_pass = data->pass;
1443 
1444 
1445          if (data->pass != 0)
1446             tu_cond_exec_end(cs);
1447          emit_perfcntrs_pass_start(cs, data->pass);
1448       }
1449 
1450       result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1451              data->app_idx);
1452       begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1453       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1454 
1455       /* result += end - begin */
1456       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1457       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1458                      CP_MEM_TO_MEM_0_DOUBLE |
1459                      CP_MEM_TO_MEM_0_NEG_C);
1460 
1461       tu_cs_emit_qw(cs, result_iova);
1462       tu_cs_emit_qw(cs, result_iova);
1463       tu_cs_emit_qw(cs, end_iova);
1464       tu_cs_emit_qw(cs, begin_iova);
1465    }
1466    tu_cond_exec_end(cs);
1467 
1468    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1469 
1470    if (cmdbuf->state.pass)
1471       cs = &cmdbuf->draw_epilogue_cs;
1472 
1473    /* Set the availability to 1 */
1474    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1475    tu_cs_emit_qw(cs, available_iova);
1476    tu_cs_emit_qw(cs, 0x1);
1477 }
1478 
1479 template <chip CHIP>
1480 static void
1481 emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1482                    struct tu_query_pool *pool,
1483                    uint32_t query,
1484                    uint32_t stream_id)
1485 {
1486    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1487 
1488    uint64_t end_iova = primitive_query_iova(pool, query, end, 0, 0);
1489    uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1490    uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1491    uint64_t begin_written_iova = primitive_query_iova(pool, query, begin, stream_id, 0);
1492    uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin, stream_id, 1);
1493    uint64_t end_written_iova = primitive_query_iova(pool, query, end, stream_id, 0);
1494    uint64_t end_generated_iova = primitive_query_iova(pool, query, end, stream_id, 1);
1495    uint64_t available_iova = query_available_iova(pool, query);
1496 
1497    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1498    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1499 
1500    tu_cs_emit_wfi(cs);
1501    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_CLEAN);
1502 
1503    /* Set the count of written primitives */
1504    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1505    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1506                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1507    tu_cs_emit_qw(cs, result_written_iova);
1508    tu_cs_emit_qw(cs, result_written_iova);
1509    tu_cs_emit_qw(cs, end_written_iova);
1510    tu_cs_emit_qw(cs, begin_written_iova);
1511 
1512    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_CLEAN);
1513 
1514    /* Set the count of generated primitives */
1515    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1516    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1517                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1518    tu_cs_emit_qw(cs, result_generated_iova);
1519    tu_cs_emit_qw(cs, result_generated_iova);
1520    tu_cs_emit_qw(cs, end_generated_iova);
1521    tu_cs_emit_qw(cs, begin_generated_iova);
1522 
1523    /* Set the availability to 1 */
1524    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1525    tu_cs_emit_qw(cs, available_iova);
1526    tu_cs_emit_qw(cs, 0x1);
1527 }
1528 
1529 template <chip CHIP>
1530 static void
1531 emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1532                               struct tu_query_pool *pool,
1533                               uint32_t query)
1534 {
1535    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1536 
1537    if (!cmdbuf->state.pass) {
1538       cmdbuf->state.prim_generated_query_running_before_rp = false;
1539    }
1540 
1541    uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1542    uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
1543    uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
1544    uint64_t available_iova = query_available_iova(pool, query);
1545 
1546    if (cmdbuf->state.pass) {
1547       tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1548                              CP_COND_REG_EXEC_0_SYSMEM |
1549                              CP_COND_REG_EXEC_0_BINNING);
1550    }
1551 
1552    tu_cs_emit_wfi(cs);
1553 
1554    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1555    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1556                   CP_REG_TO_MEM_0_CNT(2) |
1557                   CP_REG_TO_MEM_0_64B);
1558    tu_cs_emit_qw(cs, end_iova);
1559 
1560    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1561    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1562                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
1563    tu_cs_emit_qw(cs, result_iova);
1564    tu_cs_emit_qw(cs, result_iova);
1565    tu_cs_emit_qw(cs, end_iova);
1566    tu_cs_emit_qw(cs, begin_iova);
1567 
1568    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1569 
1570    /* This should come after waiting for mem writes, so that we have
1571     * up-to-date info about which query is running.
1572     */
1573    emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1574 
1575    if (cmdbuf->state.pass) {
1576       tu_cond_exec_end(cs);
1577    }
1578 
1579    if (cmdbuf->state.pass)
1580       cs = &cmdbuf->draw_epilogue_cs;
1581 
1582    /* Set the availability to 1 */
1583    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1584    tu_cs_emit_qw(cs, available_iova);
1585    tu_cs_emit_qw(cs, 0x1);
1586 }
1587 
1588 /* Implement this bit of spec text from section 17.2 "Query Operation":
1589  *
1590  *     If queries are used while executing a render pass instance that has
1591  *     multiview enabled, the query uses N consecutive query indices in the
1592  *     query pool (starting at query) where N is the number of bits set in the
1593  *     view mask in the subpass the query is used in. How the numerical
1594  *     results of the query are distributed among the queries is
1595  *     implementation-dependent. For example, some implementations may write
1596  *     each view’s results to a distinct query, while other implementations
1597  *     may write the total result to the first query and write zero to the
1598  *     other queries. However, the sum of the results in all the queries must
1599  *     accurately reflect the total result of the query summed over all views.
1600  *     Applications can sum the results from all the queries to compute the
1601  *     total result.
1602  *
1603  * Since we execute all views at once, we write zero to the other queries.
1604  * Furthermore, because queries must be reset before use, and we set the
1605  * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
1606  */
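/* As a concrete example (assuming a view mask of 0b0101, i.e. two views): the
 * query consumes slots `query` and `query + 1`; the real result accumulates
 * into slot `query`, while slot `query + 1` keeps the 0 written by
 * vkCmdResetQueryPool() and only gets its available bit set by the loop in
 * handle_multiview_queries() below.
 */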
1607 
1608 static void
1609 handle_multiview_queries(struct tu_cmd_buffer *cmd,
1610                          struct tu_query_pool *pool,
1611                          uint32_t query)
1612 {
1613    if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
1614       return;
1615 
1616    unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
1617    struct tu_cs *cs = &cmd->draw_epilogue_cs;
1618 
1619    for (uint32_t i = 1; i < views; i++) {
1620       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1621       tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
1622       tu_cs_emit_qw(cs, 0x1);
1623    }
1624 }
1625 
1626 template <chip CHIP>
1627 VKAPI_ATTR void VKAPI_CALL
1628 tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
1629                          VkQueryPool queryPool,
1630                          uint32_t query,
1631                          uint32_t index)
1632 {
1633    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1634    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1635    assert(query < pool->size);
1636 
1637    switch (pool->vk.query_type) {
1638    case VK_QUERY_TYPE_OCCLUSION:
1639       emit_end_occlusion_query<CHIP>(cmdbuf, pool, query);
1640       break;
1641    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1642       assert(index <= 4);
1643       emit_end_xfb_query<CHIP>(cmdbuf, pool, query, index);
1644       break;
1645    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1646       emit_end_prim_generated_query<CHIP>(cmdbuf, pool, query);
1647       break;
1648    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1649       emit_end_perf_query(cmdbuf, pool, query);
1650       break;
1651    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1652       emit_end_stat_query<CHIP>(cmdbuf, pool, query);
1653       break;
1654    case VK_QUERY_TYPE_TIMESTAMP:
1655       unreachable("Unimplemented query type");
1656    default:
1657       assert(!"Invalid query type");
1658    }
1659 
1660    handle_multiview_queries(cmdbuf, pool, query);
1661 }
1662 TU_GENX(tu_CmdEndQueryIndexedEXT);
1663 
1664 VKAPI_ATTR void VKAPI_CALL
1665 tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
1666                       VkPipelineStageFlagBits2 pipelineStage,
1667                       VkQueryPool queryPool,
1668                       uint32_t query)
1669 {
1670    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1671    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1672 
1673    /* Inside a render pass, just write the timestamp multiple times so that
1674     * the user gets the last one if we use GMEM. There isn't really much
1675     * better we can do, and this seems to be what the blob does too.
1676     */
1677    struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
1678 
1679    /* Stages that will already have been executed by the time the CP executes
1680     * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
1681     * indirect stage counts as top-of-pipe too.
1682     */
1683    VkPipelineStageFlags2 top_of_pipe_flags =
1684       VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
1685       VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
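   /* For instance, a timestamp requested at
    * VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT (or DRAW_INDIRECT) only has bits
    * inside top_of_pipe_flags and can be sampled immediately, while anything
    * later in the pipeline, e.g. VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT or
    * BOTTOM_OF_PIPE, has bits outside it and therefore takes the WFI path
    * below.
    */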
1686 
1687    if (pipelineStage & ~top_of_pipe_flags) {
1688       /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
1689        * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
1690        * complete.
1691        *
1692        * Stalling the CP like this is really unfortunate, but I don't think
1693        * there's a better solution that allows all 48 bits of precision
1694        * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
1695        */
1696       tu_cs_emit_wfi(cs);
1697    }
1698 
1699    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1700    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
1701                   CP_REG_TO_MEM_0_CNT(2) |
1702                   CP_REG_TO_MEM_0_64B);
1703    tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
1704 
1705    /* Only flag availability once the entire renderpass is done, similar to
1706     * the begin/end path.
1707     */
1708    cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
1709 
1710    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1711    tu_cs_emit_qw(cs, query_available_iova(pool, query));
1712    tu_cs_emit_qw(cs, 0x1);
1713 
1714    /* From the spec for vkCmdWriteTimestamp:
1715     *
1716     *    If vkCmdWriteTimestamp is called while executing a render pass
1717     *    instance that has multiview enabled, the timestamp uses N consecutive
1718     *    query indices in the query pool (starting at query) where N is the
1719     *    number of bits set in the view mask of the subpass the command is
1720     *    executed in. The resulting query values are determined by an
1721     *    implementation-dependent choice of one of the following behaviors:
1722     *
1723     *    -   The first query is a timestamp value and (if more than one bit is
1724     *        set in the view mask) zero is written to the remaining queries.
1725     *        If two timestamps are written in the same subpass, the sum of the
1726     *        execution time of all views between those commands is the
1727     *        difference between the first query written by each command.
1728     *
1729     *    -   All N queries are timestamp values. If two timestamps are written
1730     *        in the same subpass, the sum of the execution time of all views
1731     *        between those commands is the sum of the difference between
1732     *        corresponding queries written by each command. The difference
1733     *        between corresponding queries may be the execution time of a
1734     *        single view.
1735     *
1736     * We execute all views in the same draw call, so we implement the first
1737     * option, the same as regular queries.
1738     */
1739    handle_multiview_queries(cmd, pool, query);
1740 }
1741 
1742 VKAPI_ATTR void VKAPI_CALL
1743 tu_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer,
1744                                                uint32_t accelerationStructureCount,
1745                                                const VkAccelerationStructureKHR *pAccelerationStructures,
1746                                                VkQueryType queryType,
1747                                                VkQueryPool queryPool,
1748                                                uint32_t firstQuery)
1749 {
1750    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1751    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1752 
1753    struct tu_cs *cs = &cmd->cs;
1754 
1755    /* Flush any AS builds */
1756    tu_emit_cache_flush<A7XX>(cmd);
1757 
1758    for (uint32_t i = 0; i < accelerationStructureCount; ++i) {
1759       uint32_t query = i + firstQuery;
1760 
1761       VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pAccelerationStructures[i]);
1762       uint64_t va = vk_acceleration_structure_get_va(accel_struct);
1763 
1764       switch (queryType) {
1765       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
1766          va += offsetof(struct tu_accel_struct_header, compacted_size);
1767          break;
1768       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
1769          va += offsetof(struct tu_accel_struct_header, serialization_size);
1770          break;
1771       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
1772          va += offsetof(struct tu_accel_struct_header, instance_count);
1773          break;
1774       case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
1775          va += offsetof(struct tu_accel_struct_header, size);
1776          break;
1777       default:
1778          unreachable("Unhandled accel struct query type.");
1779       }
1780 
1781       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
1782       tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE);
1783       tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
1784       tu_cs_emit_qw(cs, va);
1785 
1786       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1787       tu_cs_emit_qw(cs, query_available_iova(pool, query));
1788       tu_cs_emit_qw(cs, 0x1);
1789    }
1790 }
1791 
1792 VKAPI_ATTR VkResult VKAPI_CALL
1793 tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1794     VkPhysicalDevice                            physicalDevice,
1795     uint32_t                                    queueFamilyIndex,
1796     uint32_t*                                   pCounterCount,
1797     VkPerformanceCounterKHR*                    pCounters,
1798     VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
1799 {
1800    VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1801 
1802    uint32_t desc_count = *pCounterCount;
1803    uint32_t group_count;
1804    const struct fd_perfcntr_group *group =
1805          fd_perfcntrs(&phydev->dev_id, &group_count);
1806 
1807    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
1808    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
1809                           pCounterDescriptions, &desc_count);
1810 
1811    for (int i = 0; i < group_count; i++) {
1812       for (int j = 0; j < group[i].num_countables; j++) {
1813 
1814          vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
1815             counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
1816             counter->unit =
1817                   fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
1818             counter->storage =
1819                   fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
1820 
1821             unsigned char sha1_result[20];
1822             _mesa_sha1_compute(group[i].countables[j].name,
1823                                strlen(group[i].countables[j].name),
1824                                sha1_result);
1825             memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
1826          }
1827 
1828          vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
1829             desc->flags = 0;
1830 
1831             snprintf(desc->name, sizeof(desc->name),
1832                      "%s", group[i].countables[j].name);
1833             snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
1834             snprintf(desc->description, sizeof(desc->description),
1835                      "%s: %s performance counter",
1836                      group[i].name, group[i].countables[j].name);
1837          }
1838       }
1839    }
1840 
1841    return vk_outarray_status(&out);
1842 }
1843 
1844 VKAPI_ATTR void VKAPI_CALL
1845 tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1846       VkPhysicalDevice                            physicalDevice,
1847       const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
1848       uint32_t*                                   pNumPasses)
1849 {
1850    VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1851    uint32_t group_count = 0;
1852    uint32_t gid = 0, cid = 0, n_passes;
1853    const struct fd_perfcntr_group *group =
1854          fd_perfcntrs(&phydev->dev_id, &group_count);
1855 
1856    uint32_t counters_requested[group_count];
1857    memset(counters_requested, 0x0, sizeof(counters_requested));
1858    *pNumPasses = 1;
1859 
1860    for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
1861       perfcntr_index(group, group_count,
1862                      pPerformanceQueryCreateInfo->pCounterIndices[i],
1863                      &gid, &cid);
1864 
1865       counters_requested[gid]++;
1866    }
1867 
1868    for (uint32_t i = 0; i < group_count; i++) {
1869       n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
1870       *pNumPasses = MAX2(*pNumPasses, n_passes);
1871    }
1872 }
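/* Worked example (hypothetical numbers): if a counter group exposes
 * num_counters = 4 hardware counters and the create info requests 6
 * countables from that group, counters_requested[] ends up at 6 for it, so
 * n_passes = DIV_ROUND_UP(6, 4) = 2 and *pNumPasses becomes 2 (the maximum
 * across all groups, never less than 1).
 */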
1873 
1874 VKAPI_ATTR VkResult VKAPI_CALL
1875 tu_AcquireProfilingLockKHR(VkDevice device,
1876                            const VkAcquireProfilingLockInfoKHR* pInfo)
1877 {
1878    /* TODO. Probably there's something to do for kgsl. */
1879    return VK_SUCCESS;
1880 }
1881 
1882 VKAPI_ATTR void VKAPI_CALL
1883 tu_ReleaseProfilingLockKHR(VkDevice device)
1884 {
1885    /* TODO. Probably there's something to do for kgsl. */
1886    return;
1887 }
1888