1 /*
2  * Copyright 2016 Red Hat Inc.
3  * SPDX-License-Identifier: MIT
4  *
5  * Based on anv:
6  * Copyright © 2015 Intel Corporation
7  */
8 
9 #include "tu_query_pool.h"
10 
11 #include <fcntl.h>
12 
13 #include "nir/nir_builder.h"
14 #include "util/os_time.h"
15 
16 #include "vk_util.h"
17 
18 #include "tu_buffer.h"
19 #include "tu_cmd_buffer.h"
20 #include "tu_cs.h"
21 #include "tu_device.h"
22 #include "tu_rmv.h"
23 
24 #include "common/freedreno_gpu_event.h"
25 
26 #define NSEC_PER_SEC 1000000000ull
27 #define WAIT_TIMEOUT 5
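/* Number of pipeline-statistics counters snapshotted per query: the
 * RBBM_PRIMCTR counters are 64-bit LO/HI register pairs, so the register
 * distance below is divided by 2 (i.e. 11 counters, PRIMCTR_0..PRIMCTR_10). */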
28 #define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
29 
30 struct PACKED query_slot {
31    uint64_t available;
32 };
33 
34 struct PACKED occlusion_query_slot {
35    struct query_slot common;
36    uint64_t _padding0;
37 
38    uint64_t begin;
39    uint64_t result;
40    uint64_t end;
41    uint64_t _padding1;
42 };
43 
44 struct PACKED timestamp_query_slot {
45    struct query_slot common;
46    uint64_t result;
47 };
48 
49 struct PACKED primitive_slot_value {
50    uint64_t values[2];
51 };
52 
53 struct PACKED pipeline_stat_query_slot {
54    struct query_slot common;
55    uint64_t results[STAT_COUNT];
56 
57    uint64_t begin[STAT_COUNT];
58    uint64_t end[STAT_COUNT];
59 };
60 
61 struct PACKED primitive_query_slot {
62    struct query_slot common;
63    /* The result of transform feedback queries is two integer values:
64     *   results[0] is the count of primitives written,
65     *   results[1] is the count of primitives generated.
66     * A begin/end snapshot is also stored for each of the 4 streams.
67     */
68    uint64_t results[2];
69 
70    /* Primitive counters also need to be 16-byte aligned. */
71    uint64_t _padding;
72 
73    struct primitive_slot_value begin[4];
74    struct primitive_slot_value end[4];
75 };
76 
77 struct PACKED perfcntr_query_slot {
78    uint64_t result;
79    uint64_t begin;
80    uint64_t end;
81 };
82 
83 struct PACKED perf_query_slot {
84    struct query_slot common;
85    struct perfcntr_query_slot perfcntr;
86 };
87 
88 struct PACKED primitives_generated_query_slot {
89    struct query_slot common;
90    uint64_t result;
91    uint64_t begin;
92    uint64_t end;
93 };
94 
95 /* Returns the IOVA or mapped address of a given uint64_t field
96  * in a given slot of a query pool. */
97 #define query_iova(type, pool, query, field)                               \
98    pool->bo->iova + pool->query_stride * (query) + offsetof(type, field)
99 #define query_addr(type, pool, query, field)                               \
100    (uint64_t *) ((char *) pool->bo->map + pool->query_stride * (query) +   \
101                  offsetof(type, field))
102 
103 #define occlusion_query_iova(pool, query, field)                           \
104    query_iova(struct occlusion_query_slot, pool, query, field)
105 #define occlusion_query_addr(pool, query, field)                           \
106    query_addr(struct occlusion_query_slot, pool, query, field)
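/* Illustrative expansion (not part of the original source): for example,
 * occlusion_query_iova(pool, 3, end) resolves to
 *    pool->bo->iova + pool->query_stride * 3 +
 *       offsetof(struct occlusion_query_slot, end)
 * i.e. the GPU-visible address of slot 3's 'end' counter, while
 * occlusion_query_addr() yields the equivalent CPU pointer into the mapped BO. */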
107 
108 #define pipeline_stat_query_iova(pool, query, field, idx)                  \
109    pool->bo->iova + pool->query_stride * (query) +                         \
110       offsetof_arr(struct pipeline_stat_query_slot, field, (idx))
111 
112 #define primitive_query_iova(pool, query, field, stream_id, i)             \
113    query_iova(struct primitive_query_slot, pool, query, field) +           \
114       sizeof_field(struct primitive_query_slot, field[0]) * (stream_id) +  \
115       offsetof_arr(struct primitive_slot_value, values, (i))
116 
117 #define perf_query_iova(pool, query, field, i)                             \
118    pool->bo->iova + pool->query_stride * (query) +                         \
119    sizeof(struct query_slot) +                                             \
120    sizeof(struct perfcntr_query_slot) * (i) +                              \
121    offsetof(struct perfcntr_query_slot, field)
122 
123 #define primitives_generated_query_iova(pool, query, field)                \
124    query_iova(struct primitives_generated_query_slot, pool, query, field)
125 
126 #define query_available_iova(pool, query)                                  \
127    query_iova(struct query_slot, pool, query, available)
128 
129 #define query_result_iova(pool, query, type, i)                            \
130    pool->bo->iova + pool->query_stride * (query) +                         \
131    sizeof(struct query_slot) + sizeof(type) * (i)
132 
133 #define query_result_addr(pool, query, type, i)                            \
134    (uint64_t *) ((char *) pool->bo->map + pool->query_stride * (query) +   \
135                  sizeof(struct query_slot) + sizeof(type) * (i))
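/* The generic result helpers above assume the per-query result storage starts
 * immediately after the common query_slot header; that holds for timestamp,
 * xfb, primitives-generated, pipeline-stat and perf slots. Occlusion slots
 * keep their result after the begin counter, so they use the dedicated
 * occlusion_query_iova/addr helpers instead. */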
136 
137 #define query_is_available(slot) slot->available
138 
139 static const VkPerformanceCounterUnitKHR
140 fd_perfcntr_type_to_vk_unit[] = {
141    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
142    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
143    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
144    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
145    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
146    /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert the value */
147    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
148    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
149    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
150    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
151    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
152    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
153    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
154 };
155 
156 /* TODO: this mapping comes from the freedreno implementation, where only
157  * UINT64 is used. It should be confirmed against the blob Vulkan driver
158  * once it starts supporting perf queries.
159  */
160 static const VkPerformanceCounterStorageKHR
161 fd_perfcntr_type_to_vk_storage[] = {
162    [FD_PERFCNTR_TYPE_UINT64]       = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
163    [FD_PERFCNTR_TYPE_UINT]         = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
164    [FD_PERFCNTR_TYPE_FLOAT]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
165    [FD_PERFCNTR_TYPE_PERCENTAGE]   = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
166    [FD_PERFCNTR_TYPE_BYTES]        = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
167    [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
168    [FD_PERFCNTR_TYPE_HZ]           = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
169    [FD_PERFCNTR_TYPE_DBM]          = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
170    [FD_PERFCNTR_TYPE_TEMPERATURE]  = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
171    [FD_PERFCNTR_TYPE_VOLTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
172    [FD_PERFCNTR_TYPE_AMPS]         = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
173    [FD_PERFCNTR_TYPE_WATTS]        = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
174 };
175 
176 /*
177  * Returns a pointer to a given slot in a query pool.
178  */
179 static struct query_slot *
180 slot_address(struct tu_query_pool *pool, uint32_t query)
181 {
182    return (struct query_slot *) ((char *) pool->bo->map +
183                                  query * pool->query_stride);
184 }
185 
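/* Map a flat counter index, as exposed to the application through
 * VK_KHR_performance_query, to a (group id, countable id) pair by walking
 * the per-group countable counts. */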
186 static void
187 perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
188                uint32_t index, uint32_t *gid, uint32_t *cid)
189 
190 {
191    uint32_t i;
192 
193    for (i = 0; i < group_count; i++) {
194       if (group[i].num_countables > index) {
195          *gid = i;
196          *cid = index;
197          break;
198       }
199       index -= group[i].num_countables;
200    }
201 
202    assert(i < group_count);
203 }
204 
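/* qsort() comparator ordering tu_perf_query_data entries by ascending pass
 * index. */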
205 static int
206 compare_perfcntr_pass(const void *a, const void *b)
207 {
208    return ((struct tu_perf_query_data *)a)->pass -
209           ((struct tu_perf_query_data *)b)->pass;
210 }
211 
212 VKAPI_ATTR VkResult VKAPI_CALL
213 tu_CreateQueryPool(VkDevice _device,
214                    const VkQueryPoolCreateInfo *pCreateInfo,
215                    const VkAllocationCallbacks *pAllocator,
216                    VkQueryPool *pQueryPool)
217 {
218    VK_FROM_HANDLE(tu_device, device, _device);
219    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
220    assert(pCreateInfo->queryCount > 0);
221 
222    uint32_t pool_size, slot_size;
223    const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
224 
225    pool_size = sizeof(struct tu_query_pool);
226 
227    switch (pCreateInfo->queryType) {
228    case VK_QUERY_TYPE_OCCLUSION:
229       slot_size = sizeof(struct occlusion_query_slot);
230       break;
231    case VK_QUERY_TYPE_TIMESTAMP:
232       slot_size = sizeof(struct timestamp_query_slot);
233       break;
234    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
235       slot_size = sizeof(struct primitive_query_slot);
236       break;
237    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
238       slot_size = sizeof(struct primitives_generated_query_slot);
239       break;
240    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
241       perf_query_info =
242             vk_find_struct_const(pCreateInfo->pNext,
243                                  QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
244       assert(perf_query_info);
245 
246       slot_size = sizeof(struct perf_query_slot) +
247                   sizeof(struct perfcntr_query_slot) *
248                   (perf_query_info->counterIndexCount - 1);
249 
250       /* Size of the array pool->perf_query_data */
251       pool_size += sizeof(struct tu_perf_query_data) *
252                    perf_query_info->counterIndexCount;
253       break;
254    }
255    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
256       slot_size = sizeof(struct pipeline_stat_query_slot);
257       break;
258    default:
259       unreachable("Invalid query type");
260    }
261 
262    struct tu_query_pool *pool = (struct tu_query_pool *)
263          vk_query_pool_create(&device->vk, pCreateInfo,
264                               pAllocator, pool_size);
265    if (!pool)
266       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
267 
268    if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
269       pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
270                                       &pool->perf_group_count);
271 
272       pool->counter_index_count = perf_query_info->counterIndexCount;
273 
274       /* Build the data for all requested perf counters, so we can recover
275        * the correct group id, countable id, counter register and pass index
276        * from only the counter index the application provides at submit time.
277        *
278        * Also, since this data will be sorted by pass index later, we keep the
279        * original indices and store perfcntr results according to them so apps
280        * get correct results for their own indices.
281        */
282       uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
283       memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
284       memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
285 
286       for (uint32_t i = 0; i < pool->counter_index_count; i++) {
287          uint32_t gid = 0, cid = 0;
288 
289          perfcntr_index(pool->perf_group, pool->perf_group_count,
290                         perf_query_info->pCounterIndices[i], &gid, &cid);
291 
292          pool->perf_query_data[i].gid = gid;
293          pool->perf_query_data[i].cid = cid;
294          pool->perf_query_data[i].app_idx = i;
295 
296          /* When a group's counter registers are exhausted (num_counters),
297           * start a new pass and reset the register index.
298           */
299          if (regs[gid] < pool->perf_group[gid].num_counters) {
300             pool->perf_query_data[i].cntr_reg = regs[gid]++;
301             pool->perf_query_data[i].pass = pass[gid];
302          } else {
303             pool->perf_query_data[i].pass = ++pass[gid];
304             pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
305             regs[gid]++;
306          }
307       }
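      /* Illustrative example (not in the original source): if a group exposes
       * 4 counter registers and the application selects 6 countables from it,
       * entries 0-3 get pass 0 with cntr_reg 0..3, and entries 4-5 get pass 1
       * with cntr_reg 0..1. */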
308 
309       /* Sort by pass index so we can easily prepare command streams in
310        * ascending pass order.
311        */
312       qsort(pool->perf_query_data, pool->counter_index_count,
313             sizeof(pool->perf_query_data[0]),
314             compare_perfcntr_pass);
315    }
316 
317    VkResult result = tu_bo_init_new(device, &pool->vk.base, &pool->bo,
318          pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
319    if (result != VK_SUCCESS) {
320       vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
321       return result;
322    }
323 
324    result = tu_bo_map(device, pool->bo, NULL);
325    if (result != VK_SUCCESS) {
326       tu_bo_finish(device, pool->bo);
327       vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
328       return result;
329    }
330 
331    /* Initialize all query statuses to unavailable */
332    memset(pool->bo->map, 0, pool->bo->size);
333 
334    pool->size = pCreateInfo->queryCount;
335    pool->query_stride = slot_size;
336 
337    TU_RMV(query_pool_create, device, pool);
338 
339    *pQueryPool = tu_query_pool_to_handle(pool);
340 
341    return VK_SUCCESS;
342 }
343 
344 VKAPI_ATTR void VKAPI_CALL
345 tu_DestroyQueryPool(VkDevice _device,
346                     VkQueryPool _pool,
347                     const VkAllocationCallbacks *pAllocator)
348 {
349    VK_FROM_HANDLE(tu_device, device, _device);
350    VK_FROM_HANDLE(tu_query_pool, pool, _pool);
351 
352    if (!pool)
353       return;
354 
355    TU_RMV(resource_destroy, device, pool);
356 
357    tu_bo_finish(device, pool->bo);
358    vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
359 }
360 
361 static uint32_t
362 get_result_count(struct tu_query_pool *pool)
363 {
364    switch (pool->vk.query_type) {
365    /* Occlusion and timestamp queries write one integer value */
366    case VK_QUERY_TYPE_OCCLUSION:
367    case VK_QUERY_TYPE_TIMESTAMP:
368    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
369       return 1;
370    /* Transform feedback queries write two integer values */
371    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
372       return 2;
373    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
374       return util_bitcount(pool->vk.pipeline_statistics);
375    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
376       return pool->counter_index_count;
377    default:
378       assert(!"Invalid query type");
379       return 0;
380    }
381 }
382 
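/* Consume the lowest set bit of *statistics (via u_bit_scan) and return the
 * index of that VK_QUERY_PIPELINE_STATISTIC_* flag within the
 * RBBM_PRIMCTR-ordered results[] array. */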
383 static uint32_t
384 statistics_index(uint32_t *statistics)
385 {
386    uint32_t stat;
387    stat = u_bit_scan(statistics);
388 
389    switch (1 << stat) {
390    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
391       return 0;
392    case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
393       return 1;
394    case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
395       return 2;
396    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
397       return 5;
398    case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
399       return 6;
400    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
401       return 7;
402    case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
403       return 8;
404    case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
405       return 9;
406    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
407       return 3;
408    case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
409       return 4;
410    case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
411       return 10;
412    default:
413       return 0;
414    }
415 }
416 
417 static bool
418 is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
419 {
420    return pipeline_statistics &
421           (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
422            VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
423            VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
424            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
425            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
426            VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
427            VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
428            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
429            VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
430 }
431 
432 static bool
433 is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
434 {
435    return pipeline_statistics &
436           VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
437 }
438 
439 static bool
440 is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
441 {
442    return pipeline_statistics &
443           VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
444 }
445 
446 /* Wait on the availability status of a query up until a timeout. */
447 static VkResult
448 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
449                    uint32_t query)
450 {
451    /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
452     * scheduler friendly way instead of busy polling once the patch has landed
453     * upstream. */
454    struct query_slot *slot = slot_address(pool, query);
455    uint64_t abs_timeout = os_time_get_absolute_timeout(
456          WAIT_TIMEOUT * NSEC_PER_SEC);
457    while(os_time_get_nano() < abs_timeout) {
458       if (query_is_available(slot))
459          return VK_SUCCESS;
460    }
461    return vk_error(device, VK_TIMEOUT);
462 }
463 
464 /* Writes a query value to a buffer from the CPU. */
465 static void
466 write_query_value_cpu(char* base,
467                       uint32_t offset,
468                       uint64_t value,
469                       VkQueryResultFlags flags)
470 {
471    if (flags & VK_QUERY_RESULT_64_BIT) {
472       *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
473    } else {
474       *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
475    }
476 }
477 
478 static VkResult
479 get_query_pool_results(struct tu_device *device,
480                        struct tu_query_pool *pool,
481                        uint32_t firstQuery,
482                        uint32_t queryCount,
483                        size_t dataSize,
484                        void *pData,
485                        VkDeviceSize stride,
486                        VkQueryResultFlags flags)
487 {
488    assert(dataSize >= stride * queryCount);
489 
490    char *result_base = (char *) pData;
491    VkResult result = VK_SUCCESS;
492    for (uint32_t i = 0; i < queryCount; i++) {
493       uint32_t query = firstQuery + i;
494       struct query_slot *slot = slot_address(pool, query);
495       bool available = query_is_available(slot);
496       uint32_t result_count = get_result_count(pool);
497       uint32_t statistics = pool->vk.pipeline_statistics;
498 
499       if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
500          VkResult wait_result = wait_for_available(device, pool, query);
501          if (wait_result != VK_SUCCESS)
502             return wait_result;
503          available = true;
504       } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
505          /* From the Vulkan 1.1.130 spec:
506           *
507           *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
508           *    both not set then no result values are written to pData for
509           *    queries that are in the unavailable state at the time of the
510           *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
511           *    availability state is still written to pData for those queries
512           *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
513           */
514          result = VK_NOT_READY;
515          if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
516             result_base += stride;
517             continue;
518          }
519       }
520 
521       for (uint32_t k = 0; k < result_count; k++) {
522          if (available) {
523             uint64_t *result;
524 
525             if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
526                uint32_t stat_idx = statistics_index(&statistics);
527                result = query_result_addr(pool, query, uint64_t, stat_idx);
528             } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
529                result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
530             } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
531                assert(k == 0);
532                result = occlusion_query_addr(pool, query, result);
533             } else {
534                result = query_result_addr(pool, query, uint64_t, k);
535             }
536 
537             write_query_value_cpu(result_base, k, *result, flags);
538          } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
539              /* From the Vulkan 1.1.130 spec:
540               *
541               *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
542               *   is not set, and the query’s status is unavailable, an
543               *   intermediate result value between zero and the final result
544               *   value is written to pData for that query.
545               *
546               * Just return 0 here for simplicity since it's a valid result.
547               */
548             write_query_value_cpu(result_base, k, 0, flags);
549       }
550 
551       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
552          /* From the Vulkan 1.1.130 spec:
553           *
554           *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
555           *    integer value written for each query is non-zero if the query’s
556           *    status was available or zero if the status was unavailable.
557           */
558          write_query_value_cpu(result_base, result_count, available, flags);
559 
560       result_base += stride;
561    }
562    return result;
563 }
564 
565 VKAPI_ATTR VkResult VKAPI_CALL
566 tu_GetQueryPoolResults(VkDevice _device,
567                        VkQueryPool queryPool,
568                        uint32_t firstQuery,
569                        uint32_t queryCount,
570                        size_t dataSize,
571                        void *pData,
572                        VkDeviceSize stride,
573                        VkQueryResultFlags flags)
574 {
575    VK_FROM_HANDLE(tu_device, device, _device);
576    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
577    assert(firstQuery + queryCount <= pool->size);
578 
579    if (vk_device_is_lost(&device->vk))
580       return VK_ERROR_DEVICE_LOST;
581 
582    switch (pool->vk.query_type) {
583    case VK_QUERY_TYPE_OCCLUSION:
584    case VK_QUERY_TYPE_TIMESTAMP:
585    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
586    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
587    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
588    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
589       return get_query_pool_results(device, pool, firstQuery, queryCount,
590                                     dataSize, pData, stride, flags);
591    default:
592       assert(!"Invalid query type");
593    }
594    return VK_SUCCESS;
595 }
596 
597 /* Copies a query value from one buffer to another from the GPU. */
598 static void
599 copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
600                      struct tu_cs *cs,
601                      uint64_t src_iova,
602                      uint64_t base_write_iova,
603                      uint32_t offset,
604                      VkQueryResultFlags flags) {
605    uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
606          sizeof(uint64_t) : sizeof(uint32_t);
607    uint64_t write_iova = base_write_iova + (offset * element_size);
608 
609    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
610    uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
611          CP_MEM_TO_MEM_0_DOUBLE : 0;
612    tu_cs_emit(cs, mem_to_mem_flags);
613    tu_cs_emit_qw(cs, write_iova);
614    tu_cs_emit_qw(cs, src_iova);
615 }
616 
617 template <chip CHIP>
618 static void
619 emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
620                              struct tu_cs *cs,
621                              struct tu_query_pool *pool,
622                              uint32_t firstQuery,
623                              uint32_t queryCount,
624                              struct tu_buffer *buffer,
625                              VkDeviceSize dstOffset,
626                              VkDeviceSize stride,
627                              VkQueryResultFlags flags)
628 {
629    /* Flush cache for the buffer to copy to. */
630    tu_emit_cache_flush<CHIP>(cmdbuf);
631 
632    /* From the Vulkan 1.1.130 spec:
633     *
634     *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
635     *    uses of vkCmdResetQueryPool in the same queue, without any additional
636     *    synchronization.
637     *
638     * To ensure that previous writes to the available bit are coherent, first
639     * wait for all writes to complete.
640     */
641    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
642 
643    for (uint32_t i = 0; i < queryCount; i++) {
644       uint32_t query = firstQuery + i;
645       uint64_t available_iova = query_available_iova(pool, query);
646       uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
647       uint32_t result_count = get_result_count(pool);
648       uint32_t statistics = pool->vk.pipeline_statistics;
649 
650       /* Wait for the available bit to be set if executed with the
651        * VK_QUERY_RESULT_WAIT_BIT flag. */
652       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
653          tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
654          tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
655                         CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
656          tu_cs_emit_qw(cs, available_iova);
657          tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
658          tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
659          tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
660       }
661 
662       for (uint32_t k = 0; k < result_count; k++) {
663          uint64_t result_iova;
664 
665          if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
666             uint32_t stat_idx = statistics_index(&statistics);
667             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
668          } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
669             result_iova = query_result_iova(pool, query,
670                                             struct perfcntr_query_slot, k);
671          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
672             assert(k == 0);
673             result_iova = occlusion_query_iova(pool, query, result);
674          } else {
675             result_iova = query_result_iova(pool, query, uint64_t, k);
676          }
677 
678          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
679             /* Unconditionally copying the bo->result into the buffer here is
680              * valid because we only set bo->result on vkCmdEndQuery. Thus, even
681              * if the query is unavailable, this will copy the correct partial
682              * value of 0.
683              */
684             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
685                                  k /* offset */, flags);
686          } else {
687             /* Conditionally copy bo->result into the buffer based on whether the
688              * query is available.
689              *
690              * NOTE: For the conditional packets to be executed, CP_COND_EXEC
691              * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
692              * that 0 < available < 2, aka available == 1.
693              */
694             tu_cs_reserve(cs, 7 + 6);
695             tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
696             tu_cs_emit_qw(cs, available_iova);
697             tu_cs_emit_qw(cs, available_iova);
698             tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
699             tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
700 
701             /* Start of conditional execution */
702             copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
703                               k /* offset */, flags);
704             /* End of conditional execution */
705          }
706       }
707 
708       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
709          copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
710                               result_count /* offset */, flags);
711       }
712    }
713 }
714 
715 template <chip CHIP>
716 VKAPI_ATTR void VKAPI_CALL
717 tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
718                            VkQueryPool queryPool,
719                            uint32_t firstQuery,
720                            uint32_t queryCount,
721                            VkBuffer dstBuffer,
722                            VkDeviceSize dstOffset,
723                            VkDeviceSize stride,
724                            VkQueryResultFlags flags)
725 {
726    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
727    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
728    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
729    struct tu_cs *cs = &cmdbuf->cs;
730    assert(firstQuery + queryCount <= pool->size);
731 
732    switch (pool->vk.query_type) {
733    case VK_QUERY_TYPE_OCCLUSION:
734    case VK_QUERY_TYPE_TIMESTAMP:
735    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
736    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
737    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
738       return emit_copy_query_pool_results<CHIP>(cmdbuf, cs, pool, firstQuery,
739                                                 queryCount, buffer, dstOffset,
740                                                 stride, flags);
741    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
742       unreachable("allowCommandBufferQueryCopies is false");
743    default:
744       assert(!"Invalid query type");
745    }
746 }
747 TU_GENX(tu_CmdCopyQueryPoolResults);
748 
749 static void
750 emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
751                       struct tu_query_pool *pool,
752                       uint32_t firstQuery,
753                       uint32_t queryCount)
754 {
755    struct tu_cs *cs = &cmdbuf->cs;
756 
757    for (uint32_t i = 0; i < queryCount; i++) {
758       uint32_t query = firstQuery + i;
759       uint32_t statistics = pool->vk.pipeline_statistics;
760 
761       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
762       tu_cs_emit_qw(cs, query_available_iova(pool, query));
763       tu_cs_emit_qw(cs, 0x0);
764 
765       for (uint32_t k = 0; k < get_result_count(pool); k++) {
766          uint64_t result_iova;
767 
768          if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
769             uint32_t stat_idx = statistics_index(&statistics);
770             result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
771          } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
772             result_iova = query_result_iova(pool, query,
773                                             struct perfcntr_query_slot, k);
774          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
775             assert(k == 0);
776             result_iova = occlusion_query_iova(pool, query, result);
777          } else {
778             result_iova = query_result_iova(pool, query, uint64_t, k);
779          }
780 
781          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
782          tu_cs_emit_qw(cs, result_iova);
783          tu_cs_emit_qw(cs, 0x0);
784       }
785    }
786 
787 }
788 
789 VKAPI_ATTR void VKAPI_CALL
790 tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
791                      VkQueryPool queryPool,
792                      uint32_t firstQuery,
793                      uint32_t queryCount)
794 {
795    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
796    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
797 
798    switch (pool->vk.query_type) {
799    case VK_QUERY_TYPE_TIMESTAMP:
800    case VK_QUERY_TYPE_OCCLUSION:
801    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
802    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
803    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
804    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
805       emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
806       break;
807    default:
808       assert(!"Invalid query type");
809    }
810 }
811 
812 VKAPI_ATTR void VKAPI_CALL
813 tu_ResetQueryPool(VkDevice device,
814                   VkQueryPool queryPool,
815                   uint32_t firstQuery,
816                   uint32_t queryCount)
817 {
818    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
819 
820    for (uint32_t i = 0; i < queryCount; i++) {
821       struct query_slot *slot = slot_address(pool, i + firstQuery);
822       slot->available = 0;
823 
824       for (uint32_t k = 0; k < get_result_count(pool); k++) {
825          uint64_t *res;
826 
827          if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
828             res = query_result_addr(pool, i + firstQuery,
829                                     struct perfcntr_query_slot, k);
830          } else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
831             assert(k == 0);
832             res = occlusion_query_addr(pool, i + firstQuery, result);
833          } else {
834             res = query_result_addr(pool, i + firstQuery, uint64_t, k);
835          }
836 
837          *res = 0;
838       }
839    }
840 }
841 
842 template <chip CHIP>
843 static void
844 emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
845                            struct tu_query_pool *pool,
846                            uint32_t query)
847 {
848    /* From the Vulkan 1.1.130 spec:
849     *
850     *    A query must begin and end inside the same subpass of a render pass
851     *    instance, or must both begin and end outside of a render pass
852     *    instance.
853     *
854     * Unlike on an immediate-mode renderer, Turnip renders all tiles on
855     * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
856     * query begins/ends inside the same subpass of a render pass, we need to
857     * record the packets on the secondary draw command stream. cmdbuf->draw_cs
858     * is then run on every tile during render, so we just need to accumulate
859     * sample counts in slot->result to compute the query result.
860     */
861    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
862 
863    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
864 
865    tu_cs_emit_regs(cs,
866                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
867 
868    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
869       tu_cs_emit_regs(cs,
870                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
871       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
872       tu_cs_emit(cs, ZPASS_DONE);
873       if (CHIP == A7XX) {
874          /* Copied from blob's cmdstream, not sure why it is done. */
875          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
876          tu_cs_emit(cs, CCU_CLEAN_DEPTH);
877       }
878    } else {
879       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
880       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
881                                        .write_sample_count = true).value);
882       tu_cs_emit_qw(cs, begin_iova);
883 
884       /* ZPASS_DONE events should come in begin-end pairs. When emitting an
885        * occlusion query outside of a renderpass, we emit a fake end event that
886        * closes the previous one, since the autotuner's ZPASS_DONE use could end
887        * up causing problems. This event writes into the end field of the query
888        * slot, but it will be overwritten by events in emit_end_occlusion_query
889        * with the proper value.
890        * When inside a renderpass, the corresponding ZPASS_DONE event will be
891        * emitted in emit_end_occlusion_query. We note the use of ZPASS_DONE on
892        * the state object, enabling the autotuner to optimize its own events.
893        */
894       if (!cmdbuf->state.pass) {
895          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
896          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
897                                           .write_sample_count = true,
898                                           .sample_count_end_offset = true,
899                                           .write_accum_sample_count_diff = true).value);
900          tu_cs_emit_qw(cs, begin_iova);
901       } else {
902          cmdbuf->state.rp.has_zpass_done_sample_count_write_in_rp = true;
903       }
904    }
905 }
906 
907 template <chip CHIP>
908 static void
909 emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
910                       struct tu_query_pool *pool,
911                       uint32_t query)
912 {
913    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
914    uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin, 0);
915 
916    if (is_pipeline_query_with_vertex_stage(pool->vk.pipeline_statistics)) {
917       bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
918       cmdbuf->state.prim_counters_running++;
919 
920       /* Prevent starting the primitive counters when they are supposed to
921        * be stopped for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
922        */
923       if (need_cond_exec) {
924          tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
925                         CP_COND_REG_EXEC_0_SYSMEM |
926                         CP_COND_REG_EXEC_0_BINNING);
927       }
928 
929       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
930 
931       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
932       tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
933       tu_cs_emit(cs, 0);
934 
935       if (need_cond_exec) {
936          tu_cond_exec_end(cs);
937       }
938    }
939 
940    if (is_pipeline_query_with_fragment_stage(pool->vk.pipeline_statistics)) {
941       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_FRAGMENT_CTRS);
942    }
943 
944    if (is_pipeline_query_with_compute_stage(pool->vk.pipeline_statistics)) {
945       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_COMPUTE_CTRS);
946    }
947 
948    tu_cs_emit_wfi(cs);
949 
950    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
951    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
952                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
953                   CP_REG_TO_MEM_0_64B);
954    tu_cs_emit_qw(cs, begin_iova);
955 }
956 
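/* Open a CP_REG_TEST/CP_COND_REG_EXEC predicate block so that the following
 * packets only execute when the bit for 'pass' is set in the PERF_CNTRS_REG
 * scratch register; the matching tu_cond_exec_end() closes the block. */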
957 static void
958 emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
959 {
960    tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
961    tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
962                         REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
963                   A6XX_CP_REG_TEST_0_BIT(pass) |
964                   A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);
965    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
966 }
967 
968 static void
969 emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
970                            struct tu_query_pool *pool,
971                            uint32_t query)
972 {
973    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
974    uint32_t last_pass = ~0;
975 
976    if (cmdbuf->state.pass) {
977       cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
978    }
979 
980    /* Querying perf counters happens in these steps:
981     *
982     *  0) There's a scratch reg to set a pass index for perf counters query.
983     *     Prepare cmd streams to set each pass index to the reg at device
984     *     creation time. See tu_CreateDevice in tu_device.c
985     *  1) Emit command streams to read all requested perf counters at all
986     *     passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
987     *     reads the scratch reg where pass index is set.
988     *     See emit_perfcntrs_pass_start.
989     *  2) Pick the right cs setting proper pass index to the reg and prepend
990     *     it to the command buffer at each submit time.
991     *     See tu_queue_build_msm_gem_submit_cmds in tu_knl_drm_msm.cc and
992     *     tu_knl_drm_virtio.cc and kgsl_queue_submit in tu_knl_kgsl.cc
993     *  3) If the bit for the pass index is set in the reg, the command
994     *     stream below CP_COND_REG_EXEC is executed.
995     */
996 
997    tu_cs_emit_wfi(cs);
998 
999    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1000       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1001 
1002       if (last_pass != data->pass) {
1003          last_pass = data->pass;
1004 
1005          if (data->pass != 0)
1006             tu_cond_exec_end(cs);
1007          emit_perfcntrs_pass_start(cs, data->pass);
1008       }
1009 
1010       const struct fd_perfcntr_counter *counter =
1011             &pool->perf_group[data->gid].counters[data->cntr_reg];
1012       const struct fd_perfcntr_countable *countable =
1013             &pool->perf_group[data->gid].countables[data->cid];
1014 
1015       tu_cs_emit_pkt4(cs, counter->select_reg, 1);
1016       tu_cs_emit(cs, countable->selector);
1017    }
1018    tu_cond_exec_end(cs);
1019 
1020    last_pass = ~0;
1021    tu_cs_emit_wfi(cs);
1022 
1023    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1024       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1025 
1026       if (last_pass != data->pass) {
1027          last_pass = data->pass;
1028 
1029          if (data->pass != 0)
1030             tu_cond_exec_end(cs);
1031          emit_perfcntrs_pass_start(cs, data->pass);
1032       }
1033 
1034       const struct fd_perfcntr_counter *counter =
1035             &pool->perf_group[data->gid].counters[data->cntr_reg];
1036 
1037       uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1038 
1039       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1040       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1041                      CP_REG_TO_MEM_0_64B);
1042       tu_cs_emit_qw(cs, begin_iova);
1043    }
1044    tu_cond_exec_end(cs);
1045 }
1046 
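/* Begin a transform-feedback stream query: point VPC_SO_STREAM_COUNTS at the
 * slot's begin storage and emit FD_WRITE_PRIMITIVE_COUNTS so the hardware
 * dumps the per-stream primitive counters there. */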
1047 template <chip CHIP>
1048 static void
1049 emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
1050                      struct tu_query_pool *pool,
1051                      uint32_t query,
1052                      uint32_t stream_id)
1053 {
1054    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1055    uint64_t begin_iova = primitive_query_iova(pool, query, begin, 0, 0);
1056 
1057    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
1058    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1059 }
1060 
1061 template <chip CHIP>
1062 static void
1063 emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1064                                 struct tu_query_pool *pool,
1065                                 uint32_t query)
1066 {
1067    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1068    uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1069 
1070    if (cmdbuf->state.pass) {
1071       cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
1072    } else {
1073       cmdbuf->state.prim_generated_query_running_before_rp = true;
1074    }
1075 
1076    cmdbuf->state.prim_counters_running++;
1077 
1078    if (cmdbuf->state.pass) {
1079       /* Primitives that passed all tests are still counted in each tile,
1080        * even with HW binning beforehand. Do not permit that.
1081        */
1082       tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1083                            CP_COND_REG_EXEC_0_SYSMEM |
1084                            CP_COND_REG_EXEC_0_BINNING);
1085    }
1086 
1087    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
1088 
1089    tu_cs_emit_wfi(cs);
1090 
1091    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1092    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1093                   CP_REG_TO_MEM_0_CNT(2) |
1094                   CP_REG_TO_MEM_0_64B);
1095    tu_cs_emit_qw(cs, begin_iova);
1096 
1097    if (cmdbuf->state.pass) {
1098       tu_cond_exec_end(cs);
1099    }
1100 }
1101 
1102 template <chip CHIP>
1103 VKAPI_ATTR void VKAPI_CALL
1104 tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
1105                            VkQueryPool queryPool,
1106                            uint32_t query,
1107                            VkQueryControlFlags flags,
1108                            uint32_t index)
1109 {
1110    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1111    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1112    assert(query < pool->size);
1113 
1114    switch (pool->vk.query_type) {
1115    case VK_QUERY_TYPE_OCCLUSION:
1116       /* In freedreno, there is no implementation difference between
1117        * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
1118        * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
1119        */
1120       emit_begin_occlusion_query<CHIP>(cmdbuf, pool, query);
1121       break;
1122    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1123       emit_begin_xfb_query<CHIP>(cmdbuf, pool, query, index);
1124       break;
1125    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1126       emit_begin_prim_generated_query<CHIP>(cmdbuf, pool, query);
1127       break;
1128    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1129       emit_begin_perf_query(cmdbuf, pool, query);
1130       break;
1131    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1132       emit_begin_stat_query<CHIP>(cmdbuf, pool, query);
1133       break;
1134    case VK_QUERY_TYPE_TIMESTAMP:
1135       unreachable("Unimplemented query type");
1136    default:
1137       assert(!"Invalid query type");
1138    }
1139 }
1140 TU_GENX(tu_CmdBeginQueryIndexedEXT);
1141 
1142 template <chip CHIP>
1143 static void
1144 emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
1145                          struct tu_query_pool *pool,
1146                          uint32_t query)
1147 {
1148    /* Ending an occlusion query happens in a few steps:
1149     *    1) Set the slot->end to UINT64_MAX.
1150     *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1151     *       write the current sample count value into slot->end.
1152     *    3) Since (2) is asynchronous, wait until slot->end is not equal to
1153     *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1154     *    4) Accumulate the results of the query (slot->end - slot->begin) into
1155     *       slot->result.
1156     *    5) If vkCmdEndQuery is *not* called from within the scope of a render
1157     *       pass, set the slot's available bit since the query is now done.
1158     *    6) If vkCmdEndQuery *is* called from within the scope of a render
1159     *       pass, we cannot mark as available yet since the commands in
1160     *       draw_cs are not run until vkCmdEndRenderPass.
1161     */
1162    const struct tu_render_pass *pass = cmdbuf->state.pass;
1163    struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1164 
1165    struct tu_cs *epilogue_cs = &cmdbuf->cs;
1166    if (pass)
1167       /* Technically, queries should be tracked per-subpass, but here we track
1168        * at the render pass level to simplify the code a bit. This is safe
1169        * because the only commands that use the available bit are
1170        * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1171        * cannot be invoked from inside a render pass scope.
1172        */
1173       epilogue_cs = &cmdbuf->draw_epilogue_cs;
1174 
1175    uint64_t available_iova = query_available_iova(pool, query);
1176    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1177    uint64_t result_iova = occlusion_query_iova(pool, query, result);
1178    uint64_t end_iova = occlusion_query_iova(pool, query, end);
1179 
1180    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
1181       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1182       tu_cs_emit_qw(cs, end_iova);
1183       tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1184 
1185       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1186    }
1187 
1188    tu_cs_emit_regs(cs,
1189                    A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1190 
1191    if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
1192       tu_cs_emit_regs(cs,
1193                         A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1194       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1195       tu_cs_emit(cs, ZPASS_DONE);
1196       if (CHIP == A7XX) {
1197          /* Copied from blob's cmdstream, not sure why it is done. */
1198          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1199          tu_cs_emit(cs, CCU_CLEAN_DEPTH);
1200       }
1201 
1202       tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1203       tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1204                      CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
1205       tu_cs_emit_qw(cs, end_iova);
1206       tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1207       tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1208       tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1209 
1210       /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1211       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1212       tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1213       tu_cs_emit_qw(cs, result_iova);
1214       tu_cs_emit_qw(cs, result_iova);
1215       tu_cs_emit_qw(cs, end_iova);
1216       tu_cs_emit_qw(cs, begin_iova);
1217 
1218       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1219    } else {
1220       /* When outside of a renderpass, potential autotuner activity can cause
1221        * interference between ZPASS_DONE event pairs. In that case, like at the
1222        * beginning of the occlusion query, a fake ZPASS_DONE event is emitted to
1223        * compose a begin-end event pair. The first event will write into the end
1224        * field, but that will be overwritten by the second ZPASS_DONE which will
1225        * also handle the diff accumulation.
1226        */
1227       if (!cmdbuf->state.pass) {
1228          tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
1229          tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
1230                                           .write_sample_count = true).value);
1231          tu_cs_emit_qw(cs, end_iova);
1232       }
1233 
1234       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
1235       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
1236                                        .write_sample_count = true,
1237                                        .sample_count_end_offset = true,
1238                                        .write_accum_sample_count_diff = true).value);
1239       tu_cs_emit_qw(cs, begin_iova);
1240 
1241       tu_cs_emit_wfi(cs);
1242 
1243       if (cmdbuf->device->physical_device->info->a7xx.has_generic_clear) {
1244          /* If the next renderpass uses the same depth attachment and clears
1245           * it with a generic clear, ZPASS_DONE may somehow read stale values
1246           * that are apparently invalidated by CCU_INVALIDATE_DEPTH.
1247           * See dEQP-VK.fragment_operations.early_fragment.sample_count_early_fragment_tests_depth_*
1248           */
1249          tu_emit_event_write<CHIP>(cmdbuf, epilogue_cs,
1250                                    FD_CCU_INVALIDATE_DEPTH);
1251       }
1252    }
1253 
1254    tu_cs_emit_pkt7(epilogue_cs, CP_MEM_WRITE, 4);
1255    tu_cs_emit_qw(epilogue_cs, available_iova);
1256    tu_cs_emit_qw(epilogue_cs, 0x1);
1257 }
1258 
1259 /* PRIMITIVE_CTRS is used for two distinct queries:
1260  * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
1261  * - VK_QUERY_TYPE_PIPELINE_STATISTICS
1262  * If one is nested inside the other, STOP_PRIMITIVE_CTRS should be emitted
1263  * only for the outer query.
1264  *
1265  * Also, a pipeline stat query could run outside of a renderpass while a prim
1266  * gen query runs inside a secondary cmd buffer; for such a case we have to
1267  * track the status of the pipeline stats query.
1268  */
1269 template <chip CHIP>
1270 static void
1271 emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
1272                          struct tu_cs *cs,
1273                          VkQueryType query_type)
1274 {
1275    bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
1276    cmdbuf->state.prim_counters_running--;
1277    if (cmdbuf->state.prim_counters_running == 0) {
1278       bool need_cond_exec =
1279          is_secondary &&
1280          query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
1281          is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
1282 
1283       if (!need_cond_exec) {
1284          tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1285       } else {
1286          tu_cs_reserve(cs, 7 + 2);
1287          /* Check that the pipeline stats query is not running; only then
1288           * can we stop the counter.
1289           */
1290          tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
1291          tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1292          tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1293          tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
1294          tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
1295 
1296          tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1297       }
1298    }
1299 
1300    if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1301       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1302       tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1303       tu_cs_emit(cs, 1);
1304    }
1305 }
1306 
1307 template <chip CHIP>
1308 static void
1309 emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1310                     struct tu_query_pool *pool,
1311                     uint32_t query)
1312 {
1313    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1314    uint64_t end_iova = pipeline_stat_query_iova(pool, query, end, 0);
1315    uint64_t available_iova = query_available_iova(pool, query);
1316    uint64_t result_iova;
1317    uint64_t stat_start_iova;
1318    uint64_t stat_stop_iova;
1319 
1320    if (is_pipeline_query_with_vertex_stage(pool->vk.pipeline_statistics)) {
1321       /* No need to conditionally execute STOP_PRIMITIVE_CTRS when we are
1322        * nested inside a VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query within a
1323        * renderpass, because it is already stopped.
1324        */
1325       emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
1326    }
1327 
1328    if (is_pipeline_query_with_fragment_stage(pool->vk.pipeline_statistics)) {
1329       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_FRAGMENT_CTRS);
1330    }
1331 
1332    if (is_pipeline_query_with_compute_stage(pool->vk.pipeline_statistics)) {
1333       tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_COMPUTE_CTRS);
1334    }
1335 
1336    tu_cs_emit_wfi(cs);
1337 
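   /* Copy all STAT_COUNT 64-bit RBBM_PRIMCTR registers into the end[] array
    * of this query slot with a single CP_REG_TO_MEM.
    */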
1338    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1339    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1340                   CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1341                   CP_REG_TO_MEM_0_64B);
1342    tu_cs_emit_qw(cs, end_iova);
1343 
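   /* For each statistic: result += end - begin */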
1344    for (int i = 0; i < STAT_COUNT; i++) {
1345       result_iova = query_result_iova(pool, query, uint64_t, i);
1346       stat_start_iova = pipeline_stat_query_iova(pool, query, begin, i);
1347       stat_stop_iova = pipeline_stat_query_iova(pool, query, end, i);
1348 
1349       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1350       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1351                      CP_MEM_TO_MEM_0_DOUBLE |
1352                      CP_MEM_TO_MEM_0_NEG_C);
1353 
1354       tu_cs_emit_qw(cs, result_iova);
1355       tu_cs_emit_qw(cs, result_iova);
1356       tu_cs_emit_qw(cs, stat_stop_iova);
1357       tu_cs_emit_qw(cs, stat_start_iova);
1358    }
1359 
1360    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1361 
1362    if (cmdbuf->state.pass)
1363       cs = &cmdbuf->draw_epilogue_cs;
1364 
1365    /* Set the availability to 1 */
1366    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1367    tu_cs_emit_qw(cs, available_iova);
1368    tu_cs_emit_qw(cs, 0x1);
1369 }
1370 
1371 static void
1372 emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1373                     struct tu_query_pool *pool,
1374                     uint32_t query)
1375 {
1376    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1377    uint64_t available_iova = query_available_iova(pool, query);
1378    uint64_t end_iova;
1379    uint64_t begin_iova;
1380    uint64_t result_iova;
1381    uint32_t last_pass = ~0;
1382 
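   /* First, snapshot each selected counter's raw value into its end slot,
    * grouped under per-pass conditional execution.
    */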
1383    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1384       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1385 
1386       if (last_pass != data->pass) {
1387          last_pass = data->pass;
1388 
1389          if (data->pass != 0)
1390             tu_cond_exec_end(cs);
1391          emit_perfcntrs_pass_start(cs, data->pass);
1392       }
1393 
1394       const struct fd_perfcntr_counter *counter =
1395             &pool->perf_group[data->gid].counters[data->cntr_reg];
1396 
1397       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1398 
1399       tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1400       tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1401                      CP_REG_TO_MEM_0_64B);
1402       tu_cs_emit_qw(cs, end_iova);
1403    }
1404    tu_cond_exec_end(cs);
1405 
1406    last_pass = ~0;
1407    tu_cs_emit_wfi(cs);
1408 
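   /* Then accumulate end - begin into each counter's result, again grouped
    * by pass.
    */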
1409    for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1410       struct tu_perf_query_data *data = &pool->perf_query_data[i];
1411 
1412       if (last_pass != data->pass) {
1413          last_pass = data->pass;
1414 
1415 
1416          if (data->pass != 0)
1417             tu_cond_exec_end(cs);
1418          emit_perfcntrs_pass_start(cs, data->pass);
1419       }
1420 
1421       result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1422              data->app_idx);
1423       begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1424       end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1425 
1426       /* result += end - begin */
1427       tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1428       tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1429                      CP_MEM_TO_MEM_0_DOUBLE |
1430                      CP_MEM_TO_MEM_0_NEG_C);
1431 
1432       tu_cs_emit_qw(cs, result_iova);
1433       tu_cs_emit_qw(cs, result_iova);
1434       tu_cs_emit_qw(cs, end_iova);
1435       tu_cs_emit_qw(cs, begin_iova);
1436    }
1437    tu_cond_exec_end(cs);
1438 
1439    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1440 
1441    if (cmdbuf->state.pass)
1442       cs = &cmdbuf->draw_epilogue_cs;
1443 
1444    /* Set the availability to 1 */
1445    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1446    tu_cs_emit_qw(cs, available_iova);
1447    tu_cs_emit_qw(cs, 0x1);
1448 }
1449 
1450 template <chip CHIP>
1451 static void
1452 emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1453                    struct tu_query_pool *pool,
1454                    uint32_t query,
1455                    uint32_t stream_id)
1456 {
1457    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1458 
1459    uint64_t end_iova = primitive_query_iova(pool, query, end, 0, 0);
1460    uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1461    uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1462    uint64_t begin_written_iova = primitive_query_iova(pool, query, begin, stream_id, 0);
1463    uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin, stream_id, 1);
1464    uint64_t end_written_iova = primitive_query_iova(pool, query, end, stream_id, 0);
1465    uint64_t end_generated_iova = primitive_query_iova(pool, query, end, stream_id, 1);
1466    uint64_t available_iova = query_available_iova(pool, query);
1467 
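   /* Point VPC_SO_STREAM_COUNTS at the end[] array and have the hardware
    * dump the streamout primitive counters there.
    */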
1468    tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1469    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1470 
1471    tu_cs_emit_wfi(cs);
1472    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_CLEAN);
1473 
1474    /* Set the count of written primitives */
1475    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1476    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1477                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1478    tu_cs_emit_qw(cs, result_written_iova);
1479    tu_cs_emit_qw(cs, result_written_iova);
1480    tu_cs_emit_qw(cs, end_written_iova);
1481    tu_cs_emit_qw(cs, begin_written_iova);
1482 
1483    tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_CLEAN);
1484 
1485    /* Set the count of generated primitives */
1486    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1487    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1488                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1489    tu_cs_emit_qw(cs, result_generated_iova);
1490    tu_cs_emit_qw(cs, result_generated_iova);
1491    tu_cs_emit_qw(cs, end_generated_iova);
1492    tu_cs_emit_qw(cs, begin_generated_iova);
1493 
1494    /* Set the availability to 1 */
1495    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1496    tu_cs_emit_qw(cs, available_iova);
1497    tu_cs_emit_qw(cs, 0x1);
1498 }
1499 
1500 template <chip CHIP>
1501 static void
1502 emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1503                               struct tu_query_pool *pool,
1504                               uint32_t query)
1505 {
1506    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1507 
1508    if (!cmdbuf->state.pass) {
1509       cmdbuf->state.prim_generated_query_running_before_rp = false;
1510    }
1511 
1512    uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1513    uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
1514    uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
1515    uint64_t available_iova = query_available_iova(pool, query);
1516 
1517    if (cmdbuf->state.pass) {
1518       tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1519                              CP_COND_REG_EXEC_0_SYSMEM |
1520                              CP_COND_REG_EXEC_0_BINNING);
1521    }
1522 
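   /* Wait for idle, then snapshot the primitives-generated counter
    * (RBBM_PRIMCTR_7_LO/HI) into the end field.
    */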
1523    tu_cs_emit_wfi(cs);
1524 
1525    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1526    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1527                   CP_REG_TO_MEM_0_CNT(2) |
1528                   CP_REG_TO_MEM_0_64B);
1529    tu_cs_emit_qw(cs, end_iova);
1530 
1531    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1532    tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1533                   CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
1534    tu_cs_emit_qw(cs, result_iova);
1535    tu_cs_emit_qw(cs, result_iova);
1536    tu_cs_emit_qw(cs, end_iova);
1537    tu_cs_emit_qw(cs, begin_iova);
1538 
1539    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1540 
1541    /* This must come after waiting for mem writes so that we have up-to-date
1542     * info about which query is running.
1543     */
1544    emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1545 
1546    if (cmdbuf->state.pass) {
1547       tu_cond_exec_end(cs);
1548    }
1549 
1550    if (cmdbuf->state.pass)
1551       cs = &cmdbuf->draw_epilogue_cs;
1552 
1553    /* Set the availability to 1 */
1554    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1555    tu_cs_emit_qw(cs, available_iova);
1556    tu_cs_emit_qw(cs, 0x1);
1557 }
1558 
1559 /* Implement this bit of spec text from section 17.2 "Query Operation":
1560  *
1561  *     If queries are used while executing a render pass instance that has
1562  *     multiview enabled, the query uses N consecutive query indices in the
1563  *     query pool (starting at query) where N is the number of bits set in the
1564  *     view mask in the subpass the query is used in. How the numerical
1565  *     results of the query are distributed among the queries is
1566  *     implementation-dependent. For example, some implementations may write
1567  *     each view’s results to a distinct query, while other implementations
1568  *     may write the total result to the first query and write zero to the
1569  *     other queries. However, the sum of the results in all the queries must
1570  *     accurately reflect the total result of the query summed over all views.
1571  *     Applications can sum the results from all the queries to compute the
1572  *     total result.
1573  *
1574  * Since we execute all views at once, we write zero to the other queries.
1575  * Furthermore, because queries must be reset before use, and we set the
1576  * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
1577  */
1578 
1579 static void
1580 handle_multiview_queries(struct tu_cmd_buffer *cmd,
1581                          struct tu_query_pool *pool,
1582                          uint32_t query)
1583 {
1584    if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
1585       return;
1586 
1587    unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
1588    struct tu_cs *cs = &cmd->draw_epilogue_cs;
1589 
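   /* Mark the extra per-view queries as available; their results keep the
    * zero written at reset time.
    */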
1590    for (uint32_t i = 1; i < views; i++) {
1591       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1592       tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
1593       tu_cs_emit_qw(cs, 0x1);
1594    }
1595 }
1596 
1597 template <chip CHIP>
1598 VKAPI_ATTR void VKAPI_CALL
1599 tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
1600                          VkQueryPool queryPool,
1601                          uint32_t query,
1602                          uint32_t index)
1603 {
1604    VK_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1605    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1606    assert(query < pool->size);
1607 
1608    switch (pool->vk.query_type) {
1609    case VK_QUERY_TYPE_OCCLUSION:
1610       emit_end_occlusion_query<CHIP>(cmdbuf, pool, query);
1611       break;
1612    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1613       assert(index < 4);
1614       emit_end_xfb_query<CHIP>(cmdbuf, pool, query, index);
1615       break;
1616    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1617       emit_end_prim_generated_query<CHIP>(cmdbuf, pool, query);
1618       break;
1619    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1620       emit_end_perf_query(cmdbuf, pool, query);
1621       break;
1622    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1623       emit_end_stat_query<CHIP>(cmdbuf, pool, query);
1624       break;
1625    case VK_QUERY_TYPE_TIMESTAMP:
1626       unreachable("Unimplemented query type");
1627    default:
1628       assert(!"Invalid query type");
1629    }
1630 
1631    handle_multiview_queries(cmdbuf, pool, query);
1632 }
1633 TU_GENX(tu_CmdEndQueryIndexedEXT);
1634 
1635 VKAPI_ATTR void VKAPI_CALL
1636 tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
1637                       VkPipelineStageFlagBits2 pipelineStage,
1638                       VkQueryPool queryPool,
1639                       uint32_t query)
1640 {
1641    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1642    VK_FROM_HANDLE(tu_query_pool, pool, queryPool);
1643 
1644    /* Inside a render pass, just write the timestamp multiple times so that
1645     * the user gets the last one if we use GMEM. There isn't really much
1646     * better we can do, and this seems to be what the blob does too.
1647     */
1648    struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
1649 
1650    /* Stages that will already have been executed by the time the CP executes
1651     * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
1652     * indirect stage counts as top-of-pipe too.
1653     */
1654    VkPipelineStageFlags2 top_of_pipe_flags =
1655       VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
1656       VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
1657 
1658    if (pipelineStage & ~top_of_pipe_flags) {
1659       /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
1660        * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
1661        * complete.
1662        *
1663        * Stalling the CP like this is really unfortunate, but I don't think
1664        * there's a better solution that allows all 48 bits of precision
1665        * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
1666        */
1667       tu_cs_emit_wfi(cs);
1668    }
1669 
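   /* Copy the 64-bit always-on counter into the query's result slot. */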
1670    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1671    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
1672                   CP_REG_TO_MEM_0_CNT(2) |
1673                   CP_REG_TO_MEM_0_64B);
1674    tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
1675 
1676    /* Only flag availability once the entire renderpass is done, similar to
1677     * the begin/end path.
1678     */
1679    cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
1680 
1681    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1682    tu_cs_emit_qw(cs, query_available_iova(pool, query));
1683    tu_cs_emit_qw(cs, 0x1);
1684 
1685    /* From the spec for vkCmdWriteTimestamp:
1686     *
1687     *    If vkCmdWriteTimestamp is called while executing a render pass
1688     *    instance that has multiview enabled, the timestamp uses N consecutive
1689     *    query indices in the query pool (starting at query) where N is the
1690     *    number of bits set in the view mask of the subpass the command is
1691     *    executed in. The resulting query values are determined by an
1692     *    implementation-dependent choice of one of the following behaviors:
1693     *
1694     *    -   The first query is a timestamp value and (if more than one bit is
1695     *        set in the view mask) zero is written to the remaining queries.
1696     *        If two timestamps are written in the same subpass, the sum of the
1697     *        execution time of all views between those commands is the
1698     *        difference between the first query written by each command.
1699     *
1700     *    -   All N queries are timestamp values. If two timestamps are written
1701     *        in the same subpass, the sum of the execution time of all views
1702     *        between those commands is the sum of the difference between
1703     *        corresponding queries written by each command. The difference
1704     *        between corresponding queries may be the execution time of a
1705     *        single view.
1706     *
1707     * We execute all views in the same draw call, so we implement the first
1708     * option, the same as regular queries.
1709     */
1710    handle_multiview_queries(cmd, pool, query);
1711 }
1712 
1713 VKAPI_ATTR VkResult VKAPI_CALL
1714 tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1715     VkPhysicalDevice                            physicalDevice,
1716     uint32_t                                    queueFamilyIndex,
1717     uint32_t*                                   pCounterCount,
1718     VkPerformanceCounterKHR*                    pCounters,
1719     VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
1720 {
1721    VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1722 
1723    uint32_t desc_count = *pCounterCount;
1724    uint32_t group_count;
1725    const struct fd_perfcntr_group *group =
1726          fd_perfcntrs(&phydev->dev_id, &group_count);
1727 
1728    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
1729    VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
1730                           pCounterDescriptions, &desc_count);
1731 
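   /* Expose every countable of every perf counter group as a Vulkan counter
    * plus its description.
    */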
1732    for (int i = 0; i < group_count; i++) {
1733       for (int j = 0; j < group[i].num_countables; j++) {
1734 
1735          vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
1736             counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
1737             counter->unit =
1738                   fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
1739             counter->storage =
1740                   fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
1741 
1742             unsigned char sha1_result[20];
1743             _mesa_sha1_compute(group[i].countables[j].name,
1744                                strlen(group[i].countables[j].name),
1745                                sha1_result);
1746             memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
1747          }
1748 
1749          vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
1750             desc->flags = 0;
1751 
1752             snprintf(desc->name, sizeof(desc->name),
1753                      "%s", group[i].countables[j].name);
1754             snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
1755             snprintf(desc->description, sizeof(desc->description),
1756                      "%s: %s performance counter",
1757                      group[i].name, group[i].countables[j].name);
1758          }
1759       }
1760    }
1761 
1762    return vk_outarray_status(&out);
1763 }
1764 
1765 VKAPI_ATTR void VKAPI_CALL
1766 tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1767       VkPhysicalDevice                            physicalDevice,
1768       const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
1769       uint32_t*                                   pNumPasses)
1770 {
1771    VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1772    uint32_t group_count = 0;
1773    uint32_t gid = 0, cid = 0, n_passes;
1774    const struct fd_perfcntr_group *group =
1775          fd_perfcntrs(&phydev->dev_id, &group_count);
1776 
1777    uint32_t counters_requested[group_count];
1778    memset(counters_requested, 0x0, sizeof(counters_requested));
1779    *pNumPasses = 1;
1780 
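   /* Tally how many of the requested counters come from each group. */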
1781    for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
1782       perfcntr_index(group, group_count,
1783                      pPerformanceQueryCreateInfo->pCounterIndices[i],
1784                      &gid, &cid);
1785 
1786       counters_requested[gid]++;
1787    }
1788 
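   /* A group can only expose num_counters counters per pass, so the group
    * that is most oversubscribed determines the number of passes.
    */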
1789    for (uint32_t i = 0; i < group_count; i++) {
1790       n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
1791       *pNumPasses = MAX2(*pNumPasses, n_passes);
1792    }
1793 }
1794 
1795 VKAPI_ATTR VkResult VKAPI_CALL
1796 tu_AcquireProfilingLockKHR(VkDevice device,
1797                            const VkAcquireProfilingLockInfoKHR* pInfo)
1798 {
1799    /* TODO. Probably there's something to do for kgsl. */
1800    return VK_SUCCESS;
1801 }
1802 
1803 VKAPI_ATTR void VKAPI_CALL
1804 tu_ReleaseProfilingLockKHR(VkDevice device)
1805 {
1806    /* TODO. Probably there's something to do for kgsl. */
1807    return;
1808 }
1809