1 /*
2  * Copyright 2016 Red Hat Inc.
3  * SPDX-License-Identifier: MIT
4  *
5  * Based on anv:
6  * Copyright © 2015 Intel Corporation
7  */
8 
9 #include "tu_query.h"
10 
11 #include <fcntl.h>
12 
13 #include "nir/nir_builder.h"
14 #include "util/os_time.h"
15 
16 #include "vk_util.h"
17 
18 #include "tu_cmd_buffer.h"
19 #include "tu_cs.h"
20 #include "tu_device.h"
21 
22 #include "common/freedreno_gpu_event.h"
23 
24 #define NSEC_PER_SEC 1000000000ull
25 #define WAIT_TIMEOUT 5
26 #define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
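/* Note (added for clarity, not in the original source): assuming the usual
 * 2-dword spacing between consecutive RBBM_PRIMCTR_*_LO registers, this
 * evaluates to 11, i.e. the eleven 64-bit pipeline statistics counters
 * PRIMCTR_0 through PRIMCTR_10. */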
27 
28 struct PACKED query_slot {
29    uint64_t available;
30 };
31 
32 struct PACKED occlusion_slot_value {
33    /* Sample counters seem to be placed at 16-byte-aligned offsets
34     * even though this query only needs an 8-byte slot. */
35    uint64_t value;
36    uint64_t _padding;
37 };
38 
39 struct PACKED occlusion_query_slot {
40    struct query_slot common;
41    uint64_t result;
42 
43    struct occlusion_slot_value begin;
44    struct occlusion_slot_value end;
45 };
46 
47 struct PACKED timestamp_query_slot {
48    struct query_slot common;
49    uint64_t result;
50 };
51 
52 struct PACKED primitive_slot_value {
53    uint64_t values[2];
54 };
55 
56 struct PACKED pipeline_stat_query_slot {
57    struct query_slot common;
58    uint64_t results[STAT_COUNT];
59 
60    uint64_t begin[STAT_COUNT];
61    uint64_t end[STAT_COUNT];
62 };
63 
64 struct PACKED primitive_query_slot {
65    struct query_slot common;
66    /* The result of transform feedback queries is two integer values:
67     *   results[0] is the count of primitives written,
68     *   results[1] is the count of primitives generated.
69     * Counter values are also stored per stream, one slot for each of the 4 streams.
70     */
71    uint64_t results[2];
72 
73    /* Primitive counters also need to be 16-byte aligned. */
74    uint64_t _padding;
75 
76    struct primitive_slot_value begin[4];
77    struct primitive_slot_value end[4];
78 };
79 
80 struct PACKED perfcntr_query_slot {
81    uint64_t result;
82    uint64_t begin;
83    uint64_t end;
84 };
85 
86 struct PACKED perf_query_slot {
87    struct query_slot common;
88    struct perfcntr_query_slot perfcntr;
89 };
90 
91 struct PACKED primitives_generated_query_slot {
92    struct query_slot common;
93    uint64_t result;
94    uint64_t begin;
95    uint64_t end;
96 };
97 
98 /* Returns the IOVA of a given uint64_t field in a given slot of a query
99  * pool. */
100 #define query_iova(type, pool, query, field)                         \
101    pool->bo->iova + pool->stride * (query) + offsetof(type, field)
102 
103 #define occlusion_query_iova(pool, query, field)                     \
104    query_iova(struct occlusion_query_slot, pool, query, field)
105 
106 #define pipeline_stat_query_iova(pool, query, field, idx)                    \
107    pool->bo->iova + pool->stride * (query) +                                 \
108       offsetof_arr(struct pipeline_stat_query_slot, field, (idx))
109 
110 #define primitive_query_iova(pool, query, field, stream_id, i)               \
111    query_iova(struct primitive_query_slot, pool, query, field) +             \
112       sizeof_field(struct primitive_query_slot, field[0]) * (stream_id) +    \
113       offsetof_arr(struct primitive_slot_value, values, (i))
114 
115 #define perf_query_iova(pool, query, field, i)                          \
116    pool->bo->iova + pool->stride * (query) +                             \
117    sizeof(struct query_slot) +                                   \
118    sizeof(struct perfcntr_query_slot) * (i) +                          \
119    offsetof(struct perfcntr_query_slot, field)
120 
121 #define primitives_generated_query_iova(pool, query, field)               \
122    query_iova(struct primitives_generated_query_slot, pool, query, field)
123 
124 #define query_available_iova(pool, query)                            \
125    query_iova(struct query_slot, pool, query, available)
126 
127 #define query_result_iova(pool, query, type, i)                            \
128    pool->bo->iova + pool->stride * (query) +                          \
129    sizeof(struct query_slot) + sizeof(type) * (i)
130 
131 #define query_result_addr(pool, query, type, i)                              \
132    (uint64_t *) ((char *) pool->bo->map + pool->stride * (query) +           \
133                  sizeof(struct query_slot) + sizeof(type) * (i))
134 
135 #define query_is_available(slot) slot->available
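
/* Illustrative example (not part of the original source): for an occlusion
 * query, the per-slot addresses compose from the macros above roughly as
 *
 *    uint64_t avail  = query_available_iova(pool, query);
 *    uint64_t begin  = occlusion_query_iova(pool, query, begin);
 *    uint64_t result = query_result_iova(pool, query, uint64_t, 0);
 *
 * i.e. every address is pool->bo->iova + query * pool->stride plus the
 * offset of the field inside the per-query slot struct.
 */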
136 
137 static const VkPerformanceCounterUnitKHR
138 fd_perfcntr_type_to_vk_unit[] = {
139    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
140    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
141    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
142    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
143    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
144    /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert */
145    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
146    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
147    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
148    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
149    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
150    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
151    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
152 };
153 
154 /* TODO: This basically comes from the freedreno implementation, where only
155  * UINT64 is used. It should be confirmed against the blob Vulkan driver
156  * once it starts supporting perf queries.
157  */
158 static const VkPerformanceCounterStorageKHR
159 fd_perfcntr_type_to_vk_storage[] = {
160    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
161    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
162    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
163    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
164    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
165    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
166    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
167    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
168    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
169    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
170    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
171    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
172 };
173 
174 /*
175  * Returns a pointer to a given slot in a query pool.
176  */
177 static struct query_slot *
178 slot_address(struct tu_query_pool *pool, uint32_t query)
179 {
180    return (struct query_slot *) ((char *) pool->bo->map +
181                                  query * pool->stride);
182 }
183 
184 static void
185 perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
186                uint32_t index, uint32_t *gid, uint32_t *cid)
187 
188 {
189    uint32_t i;
190 
191    for (i = 0; i < group_count; i++) {
192       if (group[i].num_countables > index) {
193          *gid = i;
194          *cid = index;
195          break;
196       }
197       index -= group[i].num_countables;
198    }
199 
200    assert(i < group_count);
201 }
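
/* Illustrative example (not part of the original source): with two groups
 * exposing 10 and 20 countables respectively, a flat counter index of 13
 * maps to *gid = 1 and *cid = 3, i.e. indices are assigned group by group
 * in order. */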
202 
203 static int
204 compare_perfcntr_pass(const void *a, const void *b)
205 {
206    return ((struct tu_perf_query_data *)a)->pass -
207           ((struct tu_perf_query_data *)b)->pass;
208 }
209 
210 VKAPI_ATTR VkResult VKAPI_CALL
211 tu_CreateQueryPool(VkDevice _device,
212                    const VkQueryPoolCreateInfo *pCreateInfo,
213                    const VkAllocationCallbacks *pAllocator,
214                    VkQueryPool *pQueryPool)
215 {
216    TU_FROM_HANDLE(tu_device, device, _device);
217    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
218    assert(pCreateInfo->queryCount > 0);
219 
220    uint32_t pool_size, slot_size;
221    const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
222 
223    pool_size = sizeof(struct tu_query_pool);
224 
225    switch (pCreateInfo->queryType) {
226    case VK_QUERY_TYPE_OCCLUSION:
227       slot_size = sizeof(struct occlusion_query_slot);
228       break;
229    case VK_QUERY_TYPE_TIMESTAMP:
230       slot_size = sizeof(struct timestamp_query_slot);
231       break;
232    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
233       slot_size = sizeof(struct primitive_query_slot);
234       break;
235    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
236       slot_size = sizeof(struct primitives_generated_query_slot);
237       break;
238    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
239       perf_query_info =
240             vk_find_struct_const(pCreateInfo->pNext,
241                                  QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
242       assert(perf_query_info);
243 
244       slot_size = sizeof(struct perf_query_slot) +
245                   sizeof(struct perfcntr_query_slot) *
246                   (perf_query_info->counterIndexCount - 1);
247 
248       /* Size of the array pool->perf_query_data */
249       pool_size += sizeof(struct tu_perf_query_data) *
250                    perf_query_info->counterIndexCount;
251       break;
252    }
253    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
254       slot_size = sizeof(struct pipeline_stat_query_slot);
255       break;
256    default:
257       unreachable("Invalid query type");
258    }
259 
260    struct tu_query_pool *pool = (struct tu_query_pool *)
261          vk_object_alloc(&device->vk, pAllocator, pool_size,
262                          VK_OBJECT_TYPE_QUERY_POOL);
263    if (!pool)
264       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
265 
266    if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
267       pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
268                                       &pool->perf_group_count);
269 
270       pool->counter_index_count = perf_query_info->counterIndexCount;
271 
272       /* Build the data for all requested perf counters, so that the correct
273        * group id, countable id, counter register and pass index can be found
274        * from just the counter index the application provides at each command
275        * submit.
276        *
277        * Also, since this data will be sorted by pass index later, we keep the
278        * original indices and store perfcntr results according to them so apps
279        * get correct results with their own indices.
279        */
280       uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
281       memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
282       memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
283 
284       for (uint32_t i = 0; i < pool->counter_index_count; i++) {
285          uint32_t gid = 0, cid = 0;
286 
287          perfcntr_index(pool->perf_group, pool->perf_group_count,
288                         perf_query_info->pCounterIndices[i], &gid, &cid);
289 
290          pool->perf_query_data[i].gid = gid;
291          pool->perf_query_data[i].cid = cid;
292          pool->perf_query_data[i].app_idx = i;
293 
294          /* When the counter registers are over capacity (num_counters),
295           * wrap around and start the next pass.
296           */
297          if (regs[gid] < pool->perf_group[gid].num_counters) {
298             pool->perf_query_data[i].cntr_reg = regs[gid]++;
299             pool->perf_query_data[i].pass = pass[gid];
300          } else {
301             pool->perf_query_data[i].pass = ++pass[gid];
302             pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
303             regs[gid]++;
304          }
305       }
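      /* Worked example (added for clarity, not in the original source): if a
       * group exposes num_counters == 4 and the app requests 6 countables
       * from it, the loop above assigns
       *    cntr_reg: 0 1 2 3 0 1
       *    pass:     0 0 0 0 1 1
       * so counters that don't fit into one pass spill over into the next.
       */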
306 
307       /* Sort by pass index so we can easily prepare command streams in
308        * ascending pass order.
309        */
310       qsort(pool->perf_query_data, pool->counter_index_count,
311             sizeof(pool->perf_query_data[0]),
312             compare_perfcntr_pass);
313    }
314 
315    VkResult result = tu_bo_init_new(device, &pool->bo,
316          pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
317    if (result != VK_SUCCESS) {
318       vk_object_free(&device->vk, pAllocator, pool);
319       return result;
320    }
321 
322    result = tu_bo_map(device, pool->bo);
323    if (result != VK_SUCCESS) {
324       tu_bo_finish(device, pool->bo);
325       vk_object_free(&device->vk, pAllocator, pool);
326       return result;
327    }
328 
329    /* Initialize all query statuses to unavailable */
330    memset(pool->bo->map, 0, pool->bo->size);
331 
332    pool->type = pCreateInfo->queryType;
333    pool->stride = slot_size;
334    pool->size = pCreateInfo->queryCount;
335    pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
336    *pQueryPool = tu_query_pool_to_handle(pool);
337 
338    return VK_SUCCESS;
339 }
340 
341 VKAPI_ATTR void VKAPI_CALL
342 tu_DestroyQueryPool(VkDevice _device,
343                     VkQueryPool _pool,
344                     const VkAllocationCallbacks *pAllocator)
345 {
346    TU_FROM_HANDLE(tu_device, device, _device);
347    TU_FROM_HANDLE(tu_query_pool, pool, _pool);
348 
349    if (!pool)
350       return;
351 
352    tu_bo_finish(device, pool->bo);
353    vk_object_free(&device->vk, pAllocator, pool);
354 }
355 
356 static uint32_t
357 get_result_count(struct tu_query_pool *pool)
358 {
359    switch (pool->type) {
360    /* Occlusion and timestamp queries write one integer value */
361    case VK_QUERY_TYPE_OCCLUSION:
362    case VK_QUERY_TYPE_TIMESTAMP:
363    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
364       return 1;
365    /* Transform feedback queries write two integer values */
366    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
367       return 2;
368    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
369       return util_bitcount(pool->pipeline_statistics);
370    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
371       return pool->counter_index_count;
372    default:
373       assert(!"Invalid query type");
374       return 0;
375    }
376 }
377 
378 static uint32_t
379 statistics_index(uint32_t *statistics)
380 {
381    uint32_t stat;
382    stat = u_bit_scan(statistics);
383 
384    switch (1 << stat) {
385    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
386    case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
387       return 0;
388    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
389       return 1;
390    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
391       return 2;
392    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
393       return 4;
394    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
395       return 5;
396    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
397       return 6;
398    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
399       return 7;
400    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
401       return 8;
402    case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
403       return 9;
404    case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
405       return 10;
406    default:
407       return 0;
408    }
409 }
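
/* Illustrative example (not part of the original source): for a pool created
 * with pipelineStatistics = INPUT_ASSEMBLY_PRIMITIVES_BIT |
 * CLIPPING_INVOCATIONS_BIT, successive calls on a copy of the mask return
 * 1 and then 7, i.e. the RBBM_PRIMCTR slots for those statistics, so results
 * are returned to the app in lowest-bit-first order. */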
410 
411 static bool
412 is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
413 {
414    return pipeline_statistics &
415           (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
416            VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
417            VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
418            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
419            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
420            VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
421            VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
422            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
423            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
424 }
425 
426 static bool
427 is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
428 {
429    return pipeline_statistics &
430           VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
431 }
432 
433 static bool
434 is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
435 {
436    return pipeline_statistics &
437           VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
438 }
439 
440 /* Wait on the availability status of a query, up to a timeout. */
441 static VkResult
442 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
443                    uint32_t query)
444 {
445    /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
446     * scheduler friendly way instead of busy polling once the patch has landed
447     * upstream. */
448    struct query_slot *slot = slot_address(pool, query);
449    uint64_t abs_timeout = os_time_get_absolute_timeout(
450          WAIT_TIMEOUT * NSEC_PER_SEC);
451    while(os_time_get_nano() < abs_timeout) {
452       if (query_is_available(slot))
453          return VK_SUCCESS;
454    }
455    return vk_error(device, VK_TIMEOUT);
456 }
457 
458 /* Writes a query value to a buffer from the CPU. */
459 static void
460 write_query_value_cpu(char* base,
461                       uint32_t offset,
462                       uint64_t value,
463                       VkQueryResultFlags flags)
464 {
465    if (flags & VK_QUERY_RESULT_64_BIT) {
466       *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
467    } else {
468       *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
469    }
470 }
471 
472 static VkResult
473 get_query_pool_results(struct tu_device *device,
474                        struct tu_query_pool *pool,
475                        uint32_t firstQuery,
476                        uint32_t queryCount,
477                        size_t dataSize,
478                        void *pData,
479                        VkDeviceSize stride,
480                        VkQueryResultFlags flags)
481 {
482    assert(dataSize >= stride * queryCount);
483 
484    char *result_base = (char *) pData;
485    VkResult result = VK_SUCCESS;
486    for (uint32_t i = 0; i < queryCount; i++) {
487       uint32_t query = firstQuery + i;
488       struct query_slot *slot = slot_address(pool, query);
489       bool available = query_is_available(slot);
490       uint32_t result_count = get_result_count(pool);
491       uint32_t statistics = pool->pipeline_statistics;
492 
493       if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
494          VkResult wait_result = wait_for_available(device, pool, query);
495          if (wait_result != VK_SUCCESS)
496             return wait_result;
497          available = true;
498       } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
499          /* From the Vulkan 1.1.130 spec:
500           *
501           *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
502           *    both not set then no result values are written to pData for
503           *    queries that are in the unavailable state at the time of the
504           *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
505           *    availability state is still written to pData for those queries
506           *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
507           */
508          result = VK_NOT_READY;
509          if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
510             result_base += stride;
511             continue;
512          }
513       }
514 
515       for (uint32_t k = 0; k < result_count; k++) {
516          if (available) {
517             uint64_t *result;
518 
519             if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
520                uint32_t stat_idx = statistics_index(&statistics);
521                result = query_result_addr(pool, query, uint64_t, stat_idx);
522             } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
523                result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
524             } else {
525                result = query_result_addr(pool, query, uint64_t, k);
526             }
527 
528             write_query_value_cpu(result_base, k, *result, flags);
529          } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
530              /* From the Vulkan 1.1.130 spec:
531               *
532               *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
533               *   is not set, and the query’s status is unavailable, an
534               *   intermediate result value between zero and the final result
535               *   value is written to pData for that query.
536               *
537               * Just return 0 here for simplicity since it's a valid result.
538               */
539             write_query_value_cpu(result_base, k, 0, flags);
540       }
541 
542       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
543          /* From the Vulkan 1.1.130 spec:
544           *
545           *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
546           *    integer value written for each query is non-zero if the query’s
547           *    status was available or zero if the status was unavailable.
548           */
549          write_query_value_cpu(result_base, result_count, available, flags);
550 
551       result_base += stride;
552    }
553    return result;
554 }
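
/* Illustrative layout example (not part of the original source): for a
 * transform feedback query pool (result_count == 2) read with
 * VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, each query
 * writes three uint64_t values into pData:
 *
 *    [0] primitives written
 *    [1] primitives generated
 *    [2] availability (non-zero if available)
 *
 * so the caller's stride must be at least 3 * sizeof(uint64_t).
 */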
555 
556 VKAPI_ATTR VkResult VKAPI_CALL
557 tu_GetQueryPoolResults(VkDevice _device,
558                        VkQueryPool queryPool,
559                        uint32_t firstQuery,
560                        uint32_t queryCount,
561                        size_t dataSize,
562                        void *pData,
563                        VkDeviceSize stride,
564                        VkQueryResultFlags flags)
565 {
566    TU_FROM_HANDLE(tu_device, device, _device);
567    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
568    assert(firstQuery + queryCount <= pool->size);
569 
570    if (vk_device_is_lost(&device->vk))
571       return VK_ERROR_DEVICE_LOST;
572 
573    switch (pool->type) {
574    case VK_QUERY_TYPE_OCCLUSION:
575    case VK_QUERY_TYPE_TIMESTAMP:
576    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
577    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
578    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
579    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
580       return get_query_pool_results(device, pool, firstQuery, queryCount,
581                                     dataSize, pData, stride, flags);
582    default:
583       assert(!"Invalid query type");
584    }
585    return VK_SUCCESS;
586 }
587 
588 /* Copies a query value from one buffer to another from the GPU. */
589 static void
590 copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
591                      struct tu_cs *cs,
592                      uint64_t src_iova,
593                      uint64_t base_write_iova,
594                      uint32_t offset,
595                      VkQueryResultFlags flags) {
596    uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
597          sizeof(uint64_t) : sizeof(uint32_t);
598    uint64_t write_iova = base_write_iova + (offset * element_size);
599 
600    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
601    uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
602          CP_MEM_TO_MEM_0_DOUBLE : 0;
603    tu_cs_emit(cs, mem_to_mem_flags);
604    tu_cs_emit_qw(cs, write_iova);
605    tu_cs_emit_qw(cs, src_iova);
606 }
607 
608 template <chip CHIP>
609 static void
610 emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
611                              struct tu_cs *cs,
612                              struct tu_query_pool *pool,
613                              uint32_t firstQuery,
614                              uint32_t queryCount,
615                              struct tu_buffer *buffer,
616                              VkDeviceSize dstOffset,
617                              VkDeviceSize stride,
618                              VkQueryResultFlags flags)
619 {
620    /* Flush cache for the buffer to copy to. */
621    tu_emit_cache_flush<CHIP>(cmdbuf);
622 
623    /* From the Vulkan 1.1.130 spec:
624     *
625     *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
626     *    uses of vkCmdResetQueryPool in the same queue, without any additional
627     *    synchronization.
628     *
629     * To ensure that previous writes to the available bit are coherent, first
630     * wait for all writes to complete.
631     */
632    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
633 
634    for (uint32_t i = 0; i < queryCount; i++) {
635       uint32_t query = firstQuery + i;
636       uint64_t available_iova = query_available_iova(pool, query);
637       uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
638       uint32_t result_count = get_result_count(pool);
639       uint32_t statistics = pool->pipeline_statistics;
640 
641       /* Wait for the available bit to be set if executed with the
642        * VK_QUERY_RESULT_WAIT_BIT flag. */
643       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
644          tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
645          tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
646                         CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
647          tu_cs_emit_qw(cs, available_iova);
648          tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
649          tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
650          tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
651       }
652 
653       for (uint32_t k = 0; k < result_count; k++) {
654          uint64_t result_iova;
655 
656          if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
657             uint32_t stat_idx = statistics_index(&statistics);
658             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
659          } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
660             result_iova = query_result_iova(pool, query,
661                                             struct perfcntr_query_slot, k);
662          } else {
663             result_iova = query_result_iova(pool, query, uint64_t, k);
664          }
665 
666          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
667             /* Unconditionally copying the bo->result into the buffer here is
668              * valid because we only set bo->result on vkCmdEndQuery. Thus, even
669              * if the query is unavailable, this will copy the correct partial
670              * value of 0.
671              */
672             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
673                                  k /* offset */, flags);
674          } else {
675             /* Conditionally copy bo->result into the buffer based on whether the
676              * query is available.
677              *
678              * NOTE: For the conditional packets to be executed, CP_COND_EXEC
679              * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
680              * that 0 < available < 2, aka available == 1.
681              */
682             tu_cs_reserve(cs, 7 + 6);
683             tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
684             tu_cs_emit_qw(cs, available_iova);
685             tu_cs_emit_qw(cs, available_iova);
686             tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
687             tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
688 
689             /* Start of conditional execution */
690             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
691                               k /* offset */, flags);
692             /* End of conditional execution */
693          }
694       }
695 
696       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
697          copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
698                               result_count /* offset */, flags);
699       }
700    }
701 }
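
/* Note (added for clarity, not in the original source): the CP_COND_EXEC test
 * emitted above behaves roughly like
 *
 *    if (*available_iova != 0 && *available_iova < 2)   // available == 1
 *       execute the next 6 dwords;
 *
 * which is why both address operands point at the same availability word and
 * REF is set to 2. */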
702 
703 template <chip CHIP>
704 VKAPI_ATTR void VKAPI_CALL
705 tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
706                            VkQueryPool queryPool,
707                            uint32_t firstQuery,
708                            uint32_t queryCount,
709                            VkBuffer dstBuffer,
710                            VkDeviceSize dstOffset,
711                            VkDeviceSize stride,
712                            VkQueryResultFlags flags)
713 {
714    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
715    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
716    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
717    struct tu_cs *cs = &cmdbuf->cs;
718    assert(firstQuery + queryCount <= pool->size);
719 
720    switch (pool->type) {
721    case VK_QUERY_TYPE_OCCLUSION:
722    case VK_QUERY_TYPE_TIMESTAMP:
723    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
724    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
725    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
726       return emit_copy_query_pool_results<CHIP>(cmdbuf, cs, pool, firstQuery,
727                                                 queryCount, buffer, dstOffset,
728                                                 stride, flags);
729    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
730       unreachable("allowCommandBufferQueryCopies is false");
731    default:
732       assert(!"Invalid query type");
733    }
734 }
735 TU_GENX(tu_CmdCopyQueryPoolResults);
736 
737 static void
738 emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
739                       struct tu_query_pool *pool,
740                       uint32_t firstQuery,
741                       uint32_t queryCount)
742 {
743    struct tu_cs *cs = &cmdbuf->cs;
744 
745    for (uint32_t i = 0; i < queryCount; i++) {
746       uint32_t query = firstQuery + i;
747       uint32_t statistics = pool->pipeline_statistics;
748 
749       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
750       tu_cs_emit_qw(cs, query_available_iova(pool, query));
751       tu_cs_emit_qw(cs, 0x0);
752 
753       for (uint32_t k = 0; k < get_result_count(pool); k++) {
754          uint64_t result_iova;
755 
756          if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
757             uint32_t stat_idx = statistics_index(&statistics);
758             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
759          } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
760             result_iova = query_result_iova(pool, query,
761                                             struct perfcntr_query_slot, k);
762          } else {
763             result_iova = query_result_iova(pool, query, uint64_t, k);
764          }
765 
766          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
767          tu_cs_emit_qw(cs, result_iova);
768          tu_cs_emit_qw(cs, 0x0);
769       }
770    }
771 
772 }
773 
774 VKAPI_ATTR void VKAPI_CALL
775 tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
776                      VkQueryPool queryPool,
777                      uint32_t firstQuery,
778                      uint32_t queryCount)
779 {
780    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
781    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
782 
783    switch (pool->type) {
784    case VK_QUERY_TYPE_TIMESTAMP:
785    case VK_QUERY_TYPE_OCCLUSION:
786    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
787    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
788    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
789    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
790       emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
791       break;
792    default:
793       assert(!"Invalid query type");
794    }
795 }
796 
797 VKAPI_ATTR void VKAPI_CALL
798 tu_ResetQueryPool(VkDevice device,
799                   VkQueryPool queryPool,
800                   uint32_t firstQuery,
801                   uint32_t queryCount)
802 {
803    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
804 
805    for (uint32_t i = 0; i < queryCount; i++) {
806       struct query_slot *slot = slot_address(pool, i + firstQuery);
807       slot->available = 0;
808 
809       for (uint32_t k = 0; k < get_result_count(pool); k++) {
810          uint64_t *res;
811 
812          if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
813             res = query_result_addr(pool, i + firstQuery,
814                                     struct perfcntr_query_slot, k);
815          } else {
816             res = query_result_addr(pool, i + firstQuery, uint64_t, k);
817          }
818 
819          *res = 0;
820       }
821    }
822 }
823 
824 template <chip CHIP>
825 static void
826 emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
827                            struct tu_query_pool *pool,
828                            uint32_t query)
829 {
830    /* From the Vulkan 1.1.130 spec:
831     *
832     *    A query must begin and end inside the same subpass of a render pass
833     *    instance, or must both begin and end outside of a render pass
834     *    instance.
835     *
836     * Unlike on an immediate-mode renderer, Turnip renders all tiles on
837     * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
838     * query begins/ends inside the same subpass of a render pass, we need to
839     * record the packets on the secondary draw command stream. cmdbuf->draw_cs
840     * is then run on every tile during render, so we just need to accumulate
841     * sample counts in slot->result to compute the query result.
842     */
843    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
844 
845    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
846 
847    tu_cs_emit_regs(cs,
848                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
849 
850    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
851       tu_cs_emit_regs(cs,
852                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
853       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
854       tu_cs_emit(cs, ZPASS_DONE);
855       if (CHIP == A7XX) {
856          /* Copied from blob's cmdstream, not sure why it is done. */
857          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
858          tu_cs_emit(cs, CCU_CLEAN_DEPTH);
859       }
860    } else {
861       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
862       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
863                                        .write_sample_count = true).value);
864       tu_cs_emit_qw(cs, begin_iova);
865    }
866 }
867 
868 template <chip CHIP>
869 static void
870 emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
871                       struct tu_query_pool *pool,
872                       uint32_t query)
873 {
874    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
875    uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin, 0);
876 
877    if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
878       bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
879       cmdbuf->state.prim_counters_running++;
880 
881       /* Don't start the primitive counters when they are supposed to be stopped
882        * for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
883        */
884       if (need_cond_exec) {
885          tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
886                         CP_COND_REG_EXEC_0_SYSMEM |
887                         CP_COND_REG_EXEC_0_BINNING);
888       }
889 
890       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
891 
892       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
893       tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
894       tu_cs_emit(cs, 0);
895 
896       if (need_cond_exec) {
897          tu_cond_exec_end(cs);
898       }
899    }
900 
901    if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
902       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_FRAGMENT_CTRS);
903    }
904 
905    if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
906       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_COMPUTE_CTRS);
907    }
908 
909    tu_cs_emit_wfi(cs);
910 
911    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
912    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
913                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
914                   CP_REG_TO_MEM_0_64B);
915    tu_cs_emit_qw(cs, begin_iova);
916 }
917 
918 static void
919 emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
920 {
921    tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
922    tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
923                         REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
924                   A6XX_CP_REG_TEST_0_BIT(pass) |
925                   A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);
926    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
927 }
928 
929 static void
930 emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
931                            struct tu_query_pool *pool,
932                            uint32_t query)
933 {
934    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
935    uint32_t last_pass = ~0;
936 
937    if (cmdbuf->state.pass) {
938       cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
939    }
940 
941    /* Querying perf counters happens in these steps:
942     *
943     *  0) There's a scratch reg to set a pass index for perf counters query.
944     *     Prepare cmd streams to set each pass index to the reg at device
945     *     creation time. See tu_CreateDevice in tu_device.c
946     *  1) Emit command streams to read all requested perf counters at all
947     *     passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
948     *     reads the scratch reg where pass index is set.
949     *     See emit_perfcntrs_pass_start.
950     *  2) Pick the right cs setting proper pass index to the reg and prepend
951     *     it to the command buffer at each submit time.
952     *     See tu_QueueSubmit in tu_drm.c
953     *  3) If the pass index bit in the reg is set, the command stream below
954     *     CP_COND_REG_EXEC is executed.
955     */
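   /* Rough sketch of the multi-pass flow (added for clarity, not in the
    * original source): if the requested counters were assigned passes
    * {0, 0, 1}, the stream below contains two CP_REG_TEST/CP_COND_REG_EXEC
    * regions, gated on bit 0 and bit 1 of the scratch reg respectively, and
    * the command buffer is expected to be submitted once per pass with the
    * matching bit set so that all counters eventually get programmed. */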
956 
957    tu_cs_emit_wfi(cs);
958 
959    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
960       struct tu_perf_query_data *data = &pool->perf_query_data[i];
961 
962       if (last_pass != data->pass) {
963          last_pass = data->pass;
964 
965          if (data->pass != 0)
966             tu_cond_exec_end(cs);
967          emit_perfcntrs_pass_start(cs, data->pass);
968       }
969 
970       const struct fd_perfcntr_counter *counter =
971             &pool->perf_group[data->gid].counters[data->cntr_reg];
972       const struct fd_perfcntr_countable *countable =
973             &pool->perf_group[data->gid].countables[data->cid];
974 
975       tu_cs_emit_pkt4(cs, counter->select_reg, 1);
976       tu_cs_emit(cs, countable->selector);
977    }
978    tu_cond_exec_end(cs);
979 
980    last_pass = ~0;
981    tu_cs_emit_wfi(cs);
982 
983    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
984       struct tu_perf_query_data *data = &pool->perf_query_data[i];
985 
986       if (last_pass != data->pass) {
987          last_pass = data->pass;
988 
989          if (data->pass != 0)
990             tu_cond_exec_end(cs);
991          emit_perfcntrs_pass_start(cs, data->pass);
992       }
993 
994       const struct fd_perfcntr_counter *counter =
995             &pool->perf_group[data->gid].counters[data->cntr_reg];
996 
997       uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
998 
999       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1000       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1001                      CP_REG_TO_MEM_0_64B);
1002       tu_cs_emit_qw(cs, begin_iova);
1003    }
1004    tu_cond_exec_end(cs);
1005 }
1006 
1007 template <chip CHIP>
1008 static void
1009 emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
1010                      struct tu_query_pool *pool,
1011                      uint32_t query,
1012                      uint32_t stream_id)
1013 {
1014    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1015    uint64_t begin_iova = primitive_query_iova(pool, query, begin, 0, 0);
1016 
1017    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
1018    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1019 }
1020 
1021 template <chip CHIP>
1022 static void
1023 emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1024                                 struct tu_query_pool *pool,
1025                                 uint32_t query)
1026 {
1027    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1028    uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1029 
1030    if (cmdbuf->state.pass) {
1031       cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
1032    } else {
1033       cmdbuf->state.prim_generated_query_running_before_rp = true;
1034    }
1035 
1036    cmdbuf->state.prim_counters_running++;
1037 
1038    if (cmdbuf->state.pass) {
1039       /* Primitives that passed all tests are still counted in each
1040        * tile even with HW binning beforehand. Do not permit it.
1041        */
1042       tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1043                            CP_COND_REG_EXEC_0_SYSMEM |
1044                            CP_COND_REG_EXEC_0_BINNING);
1045    }
1046 
1047    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
1048 
1049    tu_cs_emit_wfi(cs);
1050 
1051    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1052    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1053                   CP_REG_TO_MEM_0_CNT(2) |
1054                   CP_REG_TO_MEM_0_64B);
1055    tu_cs_emit_qw(cs, begin_iova);
1056 
1057    if (cmdbuf->state.pass) {
1058       tu_cond_exec_end(cs);
1059    }
1060 }
1061 
1062 template <chip CHIP>
1063 VKAPI_ATTR void VKAPI_CALL
1064 tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
1065                  VkQueryPool queryPool,
1066                  uint32_t query,
1067                  VkQueryControlFlags flags)
1068 {
1069    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1070    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1071    assert(query < pool->size);
1072 
1073    switch (pool->type) {
1074    case VK_QUERY_TYPE_OCCLUSION:
1075       /* In freedreno, there is no implementation difference between
1076        * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
1077        * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
1078        */
1079       emit_begin_occlusion_query<CHIP>(cmdbuf, pool, query);
1080       break;
1081    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1082       emit_begin_xfb_query<CHIP>(cmdbuf, pool, query, 0);
1083       break;
1084    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1085       emit_begin_prim_generated_query<CHIP>(cmdbuf, pool, query);
1086       break;
1087    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1088       emit_begin_perf_query(cmdbuf, pool, query);
1089       break;
1090    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1091       emit_begin_stat_query<CHIP>(cmdbuf, pool, query);
1092       break;
1093    case VK_QUERY_TYPE_TIMESTAMP:
1094       unreachable("Unimplemented query type");
1095    default:
1096       assert(!"Invalid query type");
1097    }
1098 }
1099 TU_GENX(tu_CmdBeginQuery);
1100 
1101 template <chip CHIP>
1102 VKAPI_ATTR void VKAPI_CALL
1103 tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
1104                            VkQueryPool queryPool,
1105                            uint32_t query,
1106                            VkQueryControlFlags flags,
1107                            uint32_t index)
1108 {
1109    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1110    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1111    assert(query < pool->size);
1112 
1113    switch (pool->type) {
1114    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1115       emit_begin_xfb_query<CHIP>(cmdbuf, pool, query, index);
1116       break;
1117    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1118       emit_begin_prim_generated_query<CHIP>(cmdbuf, pool, query);
1119       break;
1120    default:
1121       assert(!"Invalid query type");
1122    }
1123 }
1124 TU_GENX(tu_CmdBeginQueryIndexedEXT);
1125 
1126 template <chip CHIP>
1127 static void
1128 emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
1129                          struct tu_query_pool *pool,
1130                          uint32_t query)
1131 {
1132    /* Ending an occlusion query happens in a few steps:
1133     *    1) Set the slot->end to UINT64_MAX.
1134     *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1135     *       write the current sample count value into slot->end.
1136     *    3) Since (2) is asynchronous, wait until slot->end is not equal to
1137     *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1138     *    4) Accumulate the results of the query (slot->end - slot->begin) into
1139     *       slot->result.
1140     *    5) If vkCmdEndQuery is *not* called from within the scope of a render
1141     *       pass, set the slot's available bit since the query is now done.
1142     *    6) If vkCmdEndQuery *is* called from within the scope of a render
1143     *       pass, we cannot mark as available yet since the commands in
1144     *       draw_cs are not run until vkCmdEndRenderPass.
1145     */
1146    const struct tu_render_pass *pass = cmdbuf->state.pass;
1147    struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1148 
1149    uint64_t available_iova = query_available_iova(pool, query);
1150    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1151    uint64_t end_iova = occlusion_query_iova(pool, query, end);
1152    uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
1153    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1154    tu_cs_emit_qw(cs, end_iova);
1155    tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1156 
1157    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1158 
1159    tu_cs_emit_regs(cs,
1160                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1161 
1162    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
1163       tu_cs_emit_regs(cs,
1164                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1165       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1166       tu_cs_emit(cs, ZPASS_DONE);
1167       if (CHIP == A7XX) {
1168          /* Copied from blob's cmdstream, not sure why it is done. */
1169          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1170          tu_cs_emit(cs, CCU_CLEAN_DEPTH);
1171       }
1172    } else {
1173       /* A7XX TODO: Calculate (end - begin) via ZPASS_DONE. */
1174       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 3);
1175       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
1176                                        .write_sample_count = true).value);
1177       tu_cs_emit_qw(cs, end_iova);
1178    }
1179 
1180    tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1181    tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1182                   CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
1183    tu_cs_emit_qw(cs, end_iova);
1184    tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1185    tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1186    tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1187 
1188    /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1189    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1190    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1191    tu_cs_emit_qw(cs, result_iova);
1192    tu_cs_emit_qw(cs, result_iova);
1193    tu_cs_emit_qw(cs, end_iova);
1194    tu_cs_emit_qw(cs, begin_iova);
1195 
1196    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1197 
1198    if (pass)
1199       /* Technically, queries should be tracked per-subpass, but here we track
1200        * at the render pass level to simplify the code a bit. This is safe
1201        * because the only commands that use the available bit are
1202        * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1203        * cannot be invoked from inside a render pass scope.
1204        */
1205       cs = &cmdbuf->draw_epilogue_cs;
1206 
1207    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1208    tu_cs_emit_qw(cs, available_iova);
1209    tu_cs_emit_qw(cs, 0x1);
1210 }
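
/* Worked example (added for clarity, not in the original source): when the
 * query spans a render pass, draw_cs runs once per tile, so the CP_MEM_TO_MEM
 * above effectively accumulates
 *
 *    result = sum over all tiles of (end_tile - begin_tile)
 *
 * which is why slot->result must start at zero (see vkCmdResetQueryPool). */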
1211 
1212 /* PRIMITIVE_CTRS is used for two distinct queries:
1213  * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
1214  * - VK_QUERY_TYPE_PIPELINE_STATISTICS
1215  * If one is nested inside the other, STOP_PRIMITIVE_CTRS should be emitted
1216  * only for the outer query.
1217  *
1218  * Also, a pipeline stats query could run outside of a renderpass while a prim
1219  * gen query runs inside a secondary cmd buffer; for such a case we have to
1220  * track the status of the pipeline stats query.
1221  */
1222 template <chip CHIP>
1223 static void
1224 emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
1225                          struct tu_cs *cs,
1226                          enum VkQueryType query_type)
1227 {
1228    bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
1229    cmdbuf->state.prim_counters_running--;
1230    if (cmdbuf->state.prim_counters_running == 0) {
1231       bool need_cond_exec =
1232          is_secondary &&
1233          query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
1234          is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
1235 
1236       if (!need_cond_exec) {
1237          tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1238       } else {
1239          tu_cs_reserve(cs, 7 + 2);
1240          /* Check that the pipeline stats query is not running; only then
1241           * do we stop the counter.
1242           */
1243          tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
1244          tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1245          tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1246          tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
1247          tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
1248 
1249          tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1250       }
1251    }
1252 
1253    if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1254       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1255       tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1256       tu_cs_emit(cs, 1);
1257    }
1258 }
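
/* Illustrative nesting example (added for clarity, not in the original
 * source):
 *
 *    vkCmdBeginQuery(pipeline statistics)      // prim_counters_running -> 1
 *       vkCmdBeginQuery(primitives generated)  // prim_counters_running -> 2
 *       vkCmdEndQuery(primitives generated)    // 2 -> 1, counters keep running
 *    vkCmdEndQuery(pipeline statistics)        // 1 -> 0, STOP_PRIMITIVE_CTRS
 *
 * Only the outermost query actually stops the primitive counters. */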
1259 
1260 template <chip CHIP>
1261 static void
1262 emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1263                     struct tu_query_pool *pool,
1264                     uint32_t query)
1265 {
1266    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1267    uint64_t end_iova = pipeline_stat_query_iova(pool, query, end, 0);
1268    uint64_t available_iova = query_available_iova(pool, query);
1269    uint64_t result_iova;
1270    uint64_t stat_start_iova;
1271    uint64_t stat_stop_iova;
1272 
1273    if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
1274       /* No need to conditionally execute STOP_PRIMITIVE_CTRS when
1275        * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a
1276        * renderpass, because it is already stopped.
1277        */
1278       emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
1279    }
1280 
1281    if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
1282       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_FRAGMENT_CTRS);
1283    }
1284 
1285    if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
1286       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_COMPUTE_CTRS);
1287    }
1288 
1289    tu_cs_emit_wfi(cs);
1290 
1291    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1292    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1293                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1294                   CP_REG_TO_MEM_0_64B);
1295    tu_cs_emit_qw(cs, end_iova);
1296 
1297    for (int i = 0; i < STAT_COUNT; i++) {
1298       result_iova = query_result_iova(pool, query, uint64_t, i);
1299       stat_start_iova = pipeline_stat_query_iova(pool, query, begin, i);
1300       stat_stop_iova = pipeline_stat_query_iova(pool, query, end, i);
1301 
1302       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1303       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1304                      CP_MEM_TO_MEM_0_DOUBLE |
1305                      CP_MEM_TO_MEM_0_NEG_C);
1306 
1307       tu_cs_emit_qw(cs, result_iova);
1308       tu_cs_emit_qw(cs, result_iova);
1309       tu_cs_emit_qw(cs, stat_stop_iova);
1310       tu_cs_emit_qw(cs, stat_start_iova);
1311    }
1312 
1313    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1314 
1315    if (cmdbuf->state.pass)
1316       cs = &cmdbuf->draw_epilogue_cs;
1317 
1318    /* Set the availability to 1 */
1319    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1320    tu_cs_emit_qw(cs, available_iova);
1321    tu_cs_emit_qw(cs, 0x1);
1322 }
1323 
1324 static void
1325 emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1326                     struct tu_query_pool *pool,
1327                     uint32_t query)
1328 {
1329    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1330    uint64_t available_iova = query_available_iova(pool, query);
1331    uint64_t end_iova;
1332    uint64_t begin_iova;
1333    uint64_t result_iova;
1334    uint32_t last_pass = ~0;
1335 
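   /* Counters are grouped by pass: emit_perfcntrs_pass_start() presumably
    * opens a conditional-execution region so that, on each submission, only
    * the readback commands for the currently selected pass are executed.
    */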
1336    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1337       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1338 
1339       if (last_pass != data->pass) {
1340          last_pass = data->pass;
1341 
1342          if (data->pass != 0)
1343             tu_cond_exec_end(cs);
1344          emit_perfcntrs_pass_start(cs, data->pass);
1345       }
1346 
1347       const struct fd_perfcntr_counter *counter =
1348             &pool->perf_group[data->gid].counters[data->cntr_reg];
1349 
1350       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1351 
1352       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1353       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1354                      CP_REG_TO_MEM_0_64B);
1355       tu_cs_emit_qw(cs, end_iova);
1356    }
1357    tu_cond_exec_end(cs);
1358 
1359    last_pass = ~0;
1360    tu_cs_emit_wfi(cs);
1361 
1362    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1363       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1364 
1365       if (last_pass != data->pass) {
1366          last_pass = data->pass;
1367 
1368 
1369          if (data->pass != 0)
1370             tu_cond_exec_end(cs);
1371          emit_perfcntrs_pass_start(cs, data->pass);
1372       }
1373 
1374       result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1375              data->app_idx);
1376       begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1377       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1378 
1379       /* result += end - begin */
1380       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1381       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1382                      CP_MEM_TO_MEM_0_DOUBLE |
1383                      CP_MEM_TO_MEM_0_NEG_C);
1384 
1385       tu_cs_emit_qw(cs, result_iova);
1386       tu_cs_emit_qw(cs, result_iova);
1387       tu_cs_emit_qw(cs, end_iova);
1388       tu_cs_emit_qw(cs, begin_iova);
1389    }
1390    tu_cond_exec_end(cs);
1391 
1392    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1393 
1394    if (cmdbuf->state.pass)
1395       cs = &cmdbuf->draw_epilogue_cs;
1396 
1397    /* Set the availability to 1 */
1398    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1399    tu_cs_emit_qw(cs, available_iova);
1400    tu_cs_emit_qw(cs, 0x1);
1401 }
1402 
1403 template <chip CHIP>
1404 static void
1405 emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1406                    struct tu_query_pool *pool,
1407                    uint32_t query,
1408                    uint32_t stream_id)
1409 {
1410    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1411 
1412    uint64_t end_iova = primitive_query_iova(pool, query, end, 0, 0);
1413    uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1414    uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1415    uint64_t begin_written_iova = primitive_query_iova(pool, query, begin, stream_id, 0);
1416    uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin, stream_id, 1);
1417    uint64_t end_written_iova = primitive_query_iova(pool, query, end, stream_id, 0);
1418    uint64_t end_generated_iova = primitive_query_iova(pool, query, end, stream_id, 1);
1419    uint64_t available_iova = query_available_iova(pool, query);
1420 
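   /* It seems FD_WRITE_PRIMITIVE_COUNTS dumps the per-stream (written,
    * generated) counter pairs to the address programmed in
    * VPC_SO_STREAM_COUNTS, i.e. the "end" slots of this query.
    */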
1421    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1422    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1423 
1424    tu_cs_emit_wfi(cs);
1425    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_FLUSH);
1426 
1427    /* Set the count of written primitives */
1428    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1429    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1430                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1431    tu_cs_emit_qw(cs, result_written_iova);
1432    tu_cs_emit_qw(cs, result_written_iova);
1433    tu_cs_emit_qw(cs, end_written_iova);
1434    tu_cs_emit_qw(cs, begin_written_iova);
1435 
1436    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_FLUSH);
1437 
1438    /* Set the count of generated primitives */
1439    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1440    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1441                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1442    tu_cs_emit_qw(cs, result_generated_iova);
1443    tu_cs_emit_qw(cs, result_generated_iova);
1444    tu_cs_emit_qw(cs, end_generated_iova);
1445    tu_cs_emit_qw(cs, begin_generated_iova);
1446 
1447    /* Set the availability to 1 */
1448    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1449    tu_cs_emit_qw(cs, available_iova);
1450    tu_cs_emit_qw(cs, 0x1);
1451 }
1452 
1453 template <chip CHIP>
1454 static void
1455 emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1456                               struct tu_query_pool *pool,
1457                               uint32_t query)
1458 {
1459    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1460 
1461    if (!cmdbuf->state.pass) {
1462       cmdbuf->state.prim_generated_query_running_before_rp = false;
1463    }
1464 
1465    uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1466    uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
1467    uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
1468    uint64_t available_iova = query_available_iova(pool, query);
1469 
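   /* Inside a render pass, presumably this restricts the readback to the
    * binning and sysmem paths, so the counters are not sampled once per
    * GMEM tile.
    */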
1470    if (cmdbuf->state.pass) {
1471       tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1472                              CP_COND_REG_EXEC_0_SYSMEM |
1473                              CP_COND_REG_EXEC_0_BINNING);
1474    }
1475 
1476    tu_cs_emit_wfi(cs);
1477 
1478    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1479    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1480                   CP_REG_TO_MEM_0_CNT(2) |
1481                   CP_REG_TO_MEM_0_64B);
1482    tu_cs_emit_qw(cs, end_iova);
1483 
1484    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1485    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1486                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
1487    tu_cs_emit_qw(cs, result_iova);
1488    tu_cs_emit_qw(cs, result_iova);
1489    tu_cs_emit_qw(cs, end_iova);
1490    tu_cs_emit_qw(cs, begin_iova);
1491 
1492    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1493 
1494    /* This must come after waiting for mem writes so that we have
1495     * up-to-date info about which query is running.
1496     */
1497    emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1498 
1499    if (cmdbuf->state.pass) {
1500       tu_cond_exec_end(cs);
1501    }
1502 
1503    if (cmdbuf->state.pass)
1504       cs = &cmdbuf->draw_epilogue_cs;
1505 
1506    /* Set the availability to 1 */
1507    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1508    tu_cs_emit_qw(cs, available_iova);
1509    tu_cs_emit_qw(cs, 0x1);
1510 }
1511 
1512 /* Implement this bit of spec text from section 17.2 "Query Operation":
1513  *
1514  *     If queries are used while executing a render pass instance that has
1515  *     multiview enabled, the query uses N consecutive query indices in the
1516  *     query pool (starting at query) where N is the number of bits set in the
1517  *     view mask in the subpass the query is used in. How the numerical
1518  *     results of the query are distributed among the queries is
1519  *     implementation-dependent. For example, some implementations may write
1520  *     each view’s results to a distinct query, while other implementations
1521  *     may write the total result to the first query and write zero to the
1522  *     other queries. However, the sum of the results in all the queries must
1523  *     accurately reflect the total result of the query summed over all views.
1524  *     Applications can sum the results from all the queries to compute the
1525  *     total result.
1526  *
1527  * Since we execute all views at once, we write zero to the other queries.
1528  * Furthermore, because queries must be reset before use, and we set the
1529  * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
1530  */
1531 
1532 static void
1533 handle_multiview_queries(struct tu_cmd_buffer *cmd,
1534                          struct tu_query_pool *pool,
1535                          uint32_t query)
1536 {
1537    if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
1538       return;
1539 
1540    unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
1541    struct tu_cs *cs = &cmd->draw_epilogue_cs;
1542 
1543    for (uint32_t i = 1; i < views; i++) {
1544       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1545       tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
1546       tu_cs_emit_qw(cs, 0x1);
1547    }
1548 }
1549 
1550 template <chip CHIP>
1551 VKAPI_ATTR void VKAPI_CALL
1552 tu_CmdEndQuery(VkCommandBuffer commandBuffer,
1553                VkQueryPool queryPool,
1554                uint32_t query)
1555 {
1556    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1557    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1558    assert(query < pool->size);
1559 
1560    switch (pool->type) {
1561    case VK_QUERY_TYPE_OCCLUSION:
1562       emit_end_occlusion_query<CHIP>(cmdbuf, pool, query);
1563       break;
1564    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1565       emit_end_xfb_query<CHIP>(cmdbuf, pool, query, 0);
1566       break;
1567    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1568       emit_end_prim_generated_query<CHIP>(cmdbuf, pool, query);
1569       break;
1570    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1571       emit_end_perf_query(cmdbuf, pool, query);
1572       break;
1573    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1574       emit_end_stat_query<CHIP>(cmdbuf, pool, query);
1575       break;
1576    case VK_QUERY_TYPE_TIMESTAMP:
1577       unreachable("Unimplemented query type");
1578    default:
1579       assert(!"Invalid query type");
1580    }
1581 
1582    handle_multiview_queries(cmdbuf, pool, query);
1583 }
1584 TU_GENX(tu_CmdEndQuery);
1585 
1586 template <chip CHIP>
1587 VKAPI_ATTR void VKAPI_CALL
1588 tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
1589                          VkQueryPool queryPool,
1590                          uint32_t query,
1591                          uint32_t index)
1592 {
1593    TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1594    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1595    assert(query < pool->size);
1596 
1597    switch (pool->type) {
1598    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1599       assert(index < 4);
1600       emit_end_xfb_query<CHIP>(cmdbuf, pool, query, index);
1601       break;
1602    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1603       emit_end_prim_generated_query<CHIP>(cmdbuf, pool, query);
1604       break;
1605    default:
1606       assert(!"Invalid query type");
1607    }
1608 }
1609 TU_GENX(tu_CmdEndQueryIndexedEXT);
1610 
1611 VKAPI_ATTR void VKAPI_CALL
1612 tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
1613                       VkPipelineStageFlagBits2 pipelineStage,
1614                       VkQueryPool queryPool,
1615                       uint32_t query)
1616 {
1617    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1618    TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1619 
1620    /* Inside a render pass, just write the timestamp multiple times so that
1621     * the user gets the last one if we use GMEM. There isn't really much
1622     * better we can do, and this seems to be what the blob does too.
1623     */
1624    struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
1625 
1626    /* Stages that will already have been executed by the time the CP executes
1627     * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
1628     * indirect stage counts as top-of-pipe too.
1629     */
1630    VkPipelineStageFlags2 top_of_pipe_flags =
1631       VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
1632       VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
1633 
1634    if (pipelineStage & ~top_of_pipe_flags) {
1635       /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
1636        * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
1637        * complete.
1638        *
1639        * Stalling the CP like this is really unfortunate, but I don't think
1640        * there's a better solution that allows all 48 bits of precision
1641        * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
1642        */
1643       tu_cs_emit_wfi(cs);
1644    }
1645 
1646    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1647    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
1648                   CP_REG_TO_MEM_0_CNT(2) |
1649                   CP_REG_TO_MEM_0_64B);
1650    tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
1651 
1652    /* Only flag availability once the entire renderpass is done, similar to
1653     * the begin/end path.
1654     */
1655    cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
1656 
1657    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1658    tu_cs_emit_qw(cs, query_available_iova(pool, query));
1659    tu_cs_emit_qw(cs, 0x1);
1660 
1661    /* From the spec for vkCmdWriteTimestamp:
1662     *
1663     *    If vkCmdWriteTimestamp is called while executing a render pass
1664     *    instance that has multiview enabled, the timestamp uses N consecutive
1665     *    query indices in the query pool (starting at query) where N is the
1666     *    number of bits set in the view mask of the subpass the command is
1667     *    executed in. The resulting query values are determined by an
1668     *    implementation-dependent choice of one of the following behaviors:
1669     *
1670     *    -   The first query is a timestamp value and (if more than one bit is
1671     *        set in the view mask) zero is written to the remaining queries.
1672     *        If two timestamps are written in the same subpass, the sum of the
1673     *        execution time of all views between those commands is the
1674     *        difference between the first query written by each command.
1675     *
1676     *    -   All N queries are timestamp values. If two timestamps are written
1677     *        in the same subpass, the sum of the execution time of all views
1678     *        between those commands is the sum of the difference between
1679     *        corresponding queries written by each command. The difference
1680     *        between corresponding queries may be the execution time of a
1681     *        single view.
1682     *
1683     * We execute all views in the same draw call, so we implement the first
1684     * option, the same as regular queries.
1685     */
1686    handle_multiview_queries(cmd, pool, query);
1687 }
1688 
1689 VKAPI_ATTR VkResult VKAPI_CALL
1690 tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1691     VkPhysicalDevice                            physicalDevice,
1692     uint32_t                                    queueFamilyIndex,
1693     uint32_t*                                   pCounterCount,
1694     VkPerformanceCounterKHR*                    pCounters,
1695     VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
1696 {
1697    TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1698 
1699    uint32_t desc_count = *pCounterCount;
1700    uint32_t group_count;
1701    const struct fd_perfcntr_group *group =
1702          fd_perfcntrs(&phydev->dev_id, &group_count);
1703 
1704    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
1705    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
1706                           pCounterDescriptions, &desc_count);
1707 
1708    for (int i = 0; i < group_count; i++) {
1709       for (int j = 0; j < group[i].num_countables; j++) {
1710 
1711          vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
1712             counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
1713             counter->unit =
1714                   fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
1715             counter->storage =
1716                   fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
1717 
1718             unsigned char sha1_result[20];
1719             _mesa_sha1_compute(group[i].countables[j].name,
1720                                strlen(group[i].countables[j].name),
1721                                sha1_result);
1722             memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
1723          }
1724 
1725          vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
1726             desc->flags = 0;
1727 
1728             snprintf(desc->name, sizeof(desc->name),
1729                      "%s", group[i].countables[j].name);
1730             snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
1731             snprintf(desc->description, sizeof(desc->description),
1732                      "%s: %s performance counter",
1733                      group[i].name, group[i].countables[j].name);
1734          }
1735       }
1736    }
1737 
1738    return vk_outarray_status(&out);
1739 }
1740 
1741 VKAPI_ATTR void VKAPI_CALL
1742 tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1743       VkPhysicalDevice                            physicalDevice,
1744       const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
1745       uint32_t*                                   pNumPasses)
1746 {
1747    TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1748    uint32_t group_count = 0;
1749    uint32_t gid = 0, cid = 0, n_passes;
1750    const struct fd_perfcntr_group *group =
1751          fd_perfcntrs(&phydev->dev_id, &group_count);
1752 
1753    uint32_t counters_requested[group_count];
1754    memset(counters_requested, 0x0, sizeof(counters_requested));
1755    *pNumPasses = 1;
1756 
1757    for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
1758       perfcntr_index(group, group_count,
1759                      pPerformanceQueryCreateInfo->pCounterIndices[i],
1760                      &gid, &cid);
1761 
1762       counters_requested[gid]++;
1763    }
1764 
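   /* A single pass can only use each of a group's num_counters HW counters
    * once, so e.g. a group with 4 counters and 10 requested countables needs
    * DIV_ROUND_UP(10, 4) = 3 passes.
    */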
1765    for (uint32_t i = 0; i < group_count; i++) {
1766       n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
1767       *pNumPasses = MAX2(*pNumPasses, n_passes);
1768    }
1769 }
1770 
1771 VKAPI_ATTR VkResult VKAPI_CALL
1772 tu_AcquireProfilingLockKHR(VkDevice device,
1773                            const VkAcquireProfilingLockInfoKHR* pInfo)
1774 {
1775    /* TODO. Probably there's something to do for kgsl. */
1776    return VK_SUCCESS;
1777 }
1778 
1779 VKAPI_ATTR void VKAPI_CALL
1780 tu_ReleaseProfilingLockKHR(VkDevice device)
1781 {
1782    /* TODO. Probably there's something to do for kgsl. */
1783    return;
1784 }
1785