1 /*
2 * Copyright 2016 Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 *
5 * Based on anv:
6 * Copyright © 2015 Intel Corporation
7 */
8
9 #include "tu_query.h"
10
11 #include <fcntl.h>
12
13 #include "nir/nir_builder.h"
14 #include "util/os_time.h"
15
16 #include "vk_util.h"
17
18 #include "tu_cmd_buffer.h"
19 #include "tu_cs.h"
20 #include "tu_device.h"
21
22 #define NSEC_PER_SEC 1000000000ull
23 #define WAIT_TIMEOUT 5
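/* Each RBBM_PRIMCTR counter is a 64-bit LO/HI register pair, so dividing the
 * register-offset distance by two and adding one yields the number of
 * pipeline statistics counters captured per query. */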
24 #define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
25
26 struct PACKED query_slot {
27 uint64_t available;
28 };
29
30 struct PACKED occlusion_slot_value {
31 * The sample counters appear to be written with 16-byte alignment,
32 * even though this query only needs an 8-byte slot. */
33 uint64_t value;
34 uint64_t _padding;
35 };
36
37 struct PACKED occlusion_query_slot {
38 struct query_slot common;
39 uint64_t result;
40
41 struct occlusion_slot_value begin;
42 struct occlusion_slot_value end;
43 };
44
45 struct PACKED timestamp_query_slot {
46 struct query_slot common;
47 uint64_t result;
48 };
49
50 struct PACKED primitive_slot_value {
51 uint64_t values[2];
52 };
53
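/* results[], begin[] and end[] are all indexed by RBBM_PRIMCTR counter number;
 * statistics_index() below maps a requested pipeline statistic bit to that
 * counter number when results are written and read back. */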
54 struct PACKED pipeline_stat_query_slot {
55 struct query_slot common;
56 uint64_t results[STAT_COUNT];
57
58 uint64_t begin[STAT_COUNT];
59 uint64_t end[STAT_COUNT];
60 };
61
62 struct PACKED primitive_query_slot {
63 struct query_slot common;
64 /* The result of transform feedback queries is two integer values:
65 * results[0] is the count of primitives written,
66 * results[1] is the count of primitives generated.
67 * The begin/end counters below are captured for each of the four streams.
68 */
69 uint64_t results[2];
70
71 /* Primitive counters also need to be 16-byte aligned. */
72 uint64_t _padding;
73
74 struct primitive_slot_value begin[4];
75 struct primitive_slot_value end[4];
76 };
77
78 struct PACKED perfcntr_query_slot {
79 uint64_t result;
80 uint64_t begin;
81 uint64_t end;
82 };
83
84 struct PACKED perf_query_slot {
85 struct query_slot common;
86 struct perfcntr_query_slot perfcntr;
87 };
88
89 struct PACKED primitives_generated_query_slot {
90 struct query_slot common;
91 uint64_t result;
92 uint64_t begin;
93 uint64_t end;
94 };
95
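/* Each slot type above starts with the common availability word followed by
 * the accumulated result(s); most also keep begin/end snapshots written by
 * the GPU, with the result computed as (end - begin) when the query ends. */
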
96 /* Returns the IOVA of a given uint64_t field in a given slot of a query
97 * pool. */
98 #define query_iova(type, pool, query, field) \
99 pool->bo->iova + pool->stride * (query) + offsetof(type, field)
100
101 #define occlusion_query_iova(pool, query, field) \
102 query_iova(struct occlusion_query_slot, pool, query, field)
103
104 #define pipeline_stat_query_iova(pool, query, field) \
105 pool->bo->iova + pool->stride * (query) + \
106 offsetof(struct pipeline_stat_query_slot, field)
107
108 #define primitive_query_iova(pool, query, field, i) \
109 query_iova(struct primitive_query_slot, pool, query, field) + \
110 offsetof(struct primitive_slot_value, values[i])
111
112 #define perf_query_iova(pool, query, field, i) \
113 pool->bo->iova + pool->stride * (query) + \
114 sizeof(struct query_slot) + \
115 sizeof(struct perfcntr_query_slot) * (i) + \
116 offsetof(struct perfcntr_query_slot, field)
117
118 #define primitives_generated_query_iova(pool, query, field) \
119 query_iova(struct primitives_generated_query_slot, pool, query, field)
120
121 #define query_available_iova(pool, query) \
122 query_iova(struct query_slot, pool, query, available)
123
124 #define query_result_iova(pool, query, type, i) \
125 pool->bo->iova + pool->stride * (query) + \
126 sizeof(struct query_slot) + sizeof(type) * (i)
127
128 #define query_result_addr(pool, query, type, i) \
129 pool->bo->map + pool->stride * (query) + \
130 sizeof(struct query_slot) + sizeof(type) * (i)
131
132 #define query_is_available(slot) slot->available
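/* For example, query_result_iova(pool, q, uint64_t, 1) resolves to the second
 * 64-bit result of slot q, i.e. results[1] of a primitive_query_slot, since
 * the results immediately follow the common availability word. */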
133
134 static const VkPerformanceCounterUnitKHR
135 fd_perfcntr_type_to_vk_unit[] = {
136 [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
137 [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
138 [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
139 [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
140 [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
141 /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert the value */
142 [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
143 [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
144 [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
145 [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
146 [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
147 [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
148 [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
149 };
150
151 /* TODO: This mapping comes from the freedreno implementation, where only
152  * UINT64 is used. We should confirm it against the blob Vulkan driver
153  * once it starts supporting perf queries.
154 */
155 static const VkPerformanceCounterStorageKHR
156 fd_perfcntr_type_to_vk_storage[] = {
157 [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
158 [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
159 [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
160 [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
161 [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
162 [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
163 [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
164 [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
165 [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
166 [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
167 [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
168 [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
169 };
170
171 /*
172 * Returns a pointer to a given slot in a query pool.
173 */
174 static void* slot_address(struct tu_query_pool *pool, uint32_t query)
175 {
176 return (char*)pool->bo->map + query * pool->stride;
177 }
178
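/* Maps the flat counter index that VK_KHR_performance_query exposes to the
 * application onto a (group id, countable id) pair by walking the per-group
 * countable counts. */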
179 static void
180 perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
181 uint32_t index, uint32_t *gid, uint32_t *cid)
182
183 {
184 uint32_t i;
185
186 for (i = 0; i < group_count; i++) {
187 if (group[i].num_countables > index) {
188 *gid = i;
189 *cid = index;
190 break;
191 }
192 index -= group[i].num_countables;
193 }
194
195 assert(i < group_count);
196 }
197
198 static int
199 compare_perfcntr_pass(const void *a, const void *b)
200 {
201 return ((struct tu_perf_query_data *)a)->pass -
202 ((struct tu_perf_query_data *)b)->pass;
203 }
204
205 VKAPI_ATTR VkResult VKAPI_CALL
206 tu_CreateQueryPool(VkDevice _device,
207 const VkQueryPoolCreateInfo *pCreateInfo,
208 const VkAllocationCallbacks *pAllocator,
209 VkQueryPool *pQueryPool)
210 {
211 TU_FROM_HANDLE(tu_device, device, _device);
212 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
213 assert(pCreateInfo->queryCount > 0);
214
215 uint32_t pool_size, slot_size;
216 const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
217
218 pool_size = sizeof(struct tu_query_pool);
219
220 switch (pCreateInfo->queryType) {
221 case VK_QUERY_TYPE_OCCLUSION:
222 slot_size = sizeof(struct occlusion_query_slot);
223 break;
224 case VK_QUERY_TYPE_TIMESTAMP:
225 slot_size = sizeof(struct timestamp_query_slot);
226 break;
227 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
228 slot_size = sizeof(struct primitive_query_slot);
229 break;
230 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
231 slot_size = sizeof(struct primitives_generated_query_slot);
232 break;
233 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
234 perf_query_info =
235 vk_find_struct_const(pCreateInfo->pNext,
236 QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
237 assert(perf_query_info);
238
239 slot_size = sizeof(struct perf_query_slot) +
240 sizeof(struct perfcntr_query_slot) *
241 (perf_query_info->counterIndexCount - 1);
242
243 /* Size of the pool->perf_query_data array */
244 pool_size += sizeof(struct tu_perf_query_data) *
245 perf_query_info->counterIndexCount;
246 break;
247 }
248 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
249 slot_size = sizeof(struct pipeline_stat_query_slot);
250 break;
251 default:
252 unreachable("Invalid query type");
253 }
254
255 struct tu_query_pool *pool =
256 vk_object_alloc(&device->vk, pAllocator, pool_size,
257 VK_OBJECT_TYPE_QUERY_POOL);
258 if (!pool)
259 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
260
261 if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
262 pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
263 &pool->perf_group_count);
264
265 pool->counter_index_count = perf_query_info->counterIndexCount;
266
267 /* Build the data for every requested perf counter, so that the correct
268  * group id, countable id, counter register and pass index can be found
269  * from just the counter index an application provides at each submit.
270  *
271  * Also, since this data will be sorted by pass index later, keep the
272  * original indices and store the perf counter results according to them,
273  * so apps get correct results for their own indices.
274 */
275 uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
276 memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
277 memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
278
279 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
280 uint32_t gid = 0, cid = 0;
281
282 perfcntr_index(pool->perf_group, pool->perf_group_count,
283 perf_query_info->pCounterIndices[i], &gid, &cid);
284
285 pool->perf_query_data[i].gid = gid;
286 pool->perf_query_data[i].cid = cid;
287 pool->perf_query_data[i].app_idx = i;
288
289 /* When a group runs out of counter registers (num_counters), move
290  * this countable to the next pass and restart register assignment.
291 */
292 if (regs[gid] < pool->perf_group[gid].num_counters) {
293 pool->perf_query_data[i].cntr_reg = regs[gid]++;
294 pool->perf_query_data[i].pass = pass[gid];
295 } else {
296 pool->perf_query_data[i].pass = ++pass[gid];
297 pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
298 regs[gid]++;
299 }
300 }
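/* As an example of the assignment above: if a group exposes 4 counter
 * registers and 6 of its countables are requested, the first 4 get pass 0
 * (registers 0-3) and the remaining 2 get pass 1 (registers 0-1). */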
301
302 /* Sort by pass index so that we can easily build a command stream
303  * in ascending pass order.
304 */
305 qsort(pool->perf_query_data, pool->counter_index_count,
306 sizeof(pool->perf_query_data[0]),
307 compare_perfcntr_pass);
308 }
309
310 VkResult result = tu_bo_init_new(device, &pool->bo,
311 pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS);
312 if (result != VK_SUCCESS) {
313 vk_object_free(&device->vk, pAllocator, pool);
314 return result;
315 }
316
317 result = tu_bo_map(device, pool->bo);
318 if (result != VK_SUCCESS) {
319 tu_bo_finish(device, pool->bo);
320 vk_object_free(&device->vk, pAllocator, pool);
321 return result;
322 }
323
324 /* Initialize all query statuses to unavailable */
325 memset(pool->bo->map, 0, pool->bo->size);
326
327 pool->type = pCreateInfo->queryType;
328 pool->stride = slot_size;
329 pool->size = pCreateInfo->queryCount;
330 pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
331 *pQueryPool = tu_query_pool_to_handle(pool);
332
333 return VK_SUCCESS;
334 }
335
336 VKAPI_ATTR void VKAPI_CALL
337 tu_DestroyQueryPool(VkDevice _device,
338 VkQueryPool _pool,
339 const VkAllocationCallbacks *pAllocator)
340 {
341 TU_FROM_HANDLE(tu_device, device, _device);
342 TU_FROM_HANDLE(tu_query_pool, pool, _pool);
343
344 if (!pool)
345 return;
346
347 tu_bo_finish(device, pool->bo);
348 vk_object_free(&device->vk, pAllocator, pool);
349 }
350
351 static uint32_t
352 get_result_count(struct tu_query_pool *pool)
353 {
354 switch (pool->type) {
355 /* Occlusion, timestamp, and primitives-generated queries write one integer value */
356 case VK_QUERY_TYPE_OCCLUSION:
357 case VK_QUERY_TYPE_TIMESTAMP:
358 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
359 return 1;
360 /* Transform feedback queries write two integer values */
361 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
362 return 2;
363 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
364 return util_bitcount(pool->pipeline_statistics);
365 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
366 return pool->counter_index_count;
367 default:
368 assert(!"Invalid query type");
369 return 0;
370 }
371 }
372
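/* Consumes the lowest set bit of *statistics and returns the index of the
 * RBBM_PRIMCTR counter that backs that pipeline statistic. */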
373 static uint32_t
374 statistics_index(uint32_t *statistics)
375 {
376 uint32_t stat;
377 stat = u_bit_scan(statistics);
378
379 switch (1 << stat) {
380 case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
381 case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
382 return 0;
383 case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
384 return 1;
385 case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
386 return 2;
387 case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
388 return 4;
389 case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
390 return 5;
391 case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
392 return 6;
393 case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
394 return 7;
395 case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
396 return 8;
397 case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
398 return 9;
399 case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
400 return 10;
401 default:
402 return 0;
403 }
404 }
405
406 static bool
407 is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
408 {
409 return pipeline_statistics &
410 (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
411 VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
412 VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
413 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
414 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
415 VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
416 VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
417 VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
418 VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
419 }
420
421 static bool
422 is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
423 {
424 return pipeline_statistics &
425 VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
426 }
427
428 static bool
429 is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
430 {
431 return pipeline_statistics &
432 VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
433 }
434
435 /* Wait on the availability status of a query up until a timeout. */
436 static VkResult
437 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
438 uint32_t query)
439 {
440 /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
441 * scheduler friendly way instead of busy polling once the patch has landed
442 * upstream. */
443 struct query_slot *slot = slot_address(pool, query);
444 uint64_t abs_timeout = os_time_get_absolute_timeout(
445 WAIT_TIMEOUT * NSEC_PER_SEC);
446 while(os_time_get_nano() < abs_timeout) {
447 if (query_is_available(slot))
448 return VK_SUCCESS;
449 }
450 return vk_error(device, VK_TIMEOUT);
451 }
452
453 /* Writes a query value to a buffer from the CPU. */
454 static void
455 write_query_value_cpu(char* base,
456 uint32_t offset,
457 uint64_t value,
458 VkQueryResultFlags flags)
459 {
460 if (flags & VK_QUERY_RESULT_64_BIT) {
461 *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
462 } else {
463 *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
464 }
465 }
466
467 static VkResult
468 get_query_pool_results(struct tu_device *device,
469 struct tu_query_pool *pool,
470 uint32_t firstQuery,
471 uint32_t queryCount,
472 size_t dataSize,
473 void *pData,
474 VkDeviceSize stride,
475 VkQueryResultFlags flags)
476 {
477 assert(dataSize >= stride * queryCount);
478
479 char *result_base = pData;
480 VkResult result = VK_SUCCESS;
481 for (uint32_t i = 0; i < queryCount; i++) {
482 uint32_t query = firstQuery + i;
483 struct query_slot *slot = slot_address(pool, query);
484 bool available = query_is_available(slot);
485 uint32_t result_count = get_result_count(pool);
486 uint32_t statistics = pool->pipeline_statistics;
487
488 if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
489 VkResult wait_result = wait_for_available(device, pool, query);
490 if (wait_result != VK_SUCCESS)
491 return wait_result;
492 available = true;
493 } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
494 /* From the Vulkan 1.1.130 spec:
495 *
496 * If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
497 * both not set then no result values are written to pData for
498 * queries that are in the unavailable state at the time of the
499 * call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
500 * availability state is still written to pData for those queries
501 * if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
502 */
503 result = VK_NOT_READY;
504 if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
505 result_base += stride;
506 continue;
507 }
508 }
509
510 for (uint32_t k = 0; k < result_count; k++) {
511 if (available) {
512 uint64_t *result;
513
514 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
515 uint32_t stat_idx = statistics_index(&statistics);
516 result = query_result_addr(pool, query, uint64_t, stat_idx);
517 } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
518 result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
519 } else {
520 result = query_result_addr(pool, query, uint64_t, k);
521 }
522
523 write_query_value_cpu(result_base, k, *result, flags);
524 } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
525 /* From the Vulkan 1.1.130 spec:
526 *
527 * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
528 * is not set, and the query’s status is unavailable, an
529 * intermediate result value between zero and the final result
530 * value is written to pData for that query.
531 *
532 * Just return 0 here for simplicity since it's a valid result.
533 */
534 write_query_value_cpu(result_base, k, 0, flags);
535 }
536
537 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
538 /* From the Vulkan 1.1.130 spec:
539 *
540 * If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
541 * integer value written for each query is non-zero if the query’s
542 * status was available or zero if the status was unavailable.
543 */
544 write_query_value_cpu(result_base, result_count, available, flags);
545
546 result_base += stride;
547 }
548 return result;
549 }
550
551 VKAPI_ATTR VkResult VKAPI_CALL
552 tu_GetQueryPoolResults(VkDevice _device,
553 VkQueryPool queryPool,
554 uint32_t firstQuery,
555 uint32_t queryCount,
556 size_t dataSize,
557 void *pData,
558 VkDeviceSize stride,
559 VkQueryResultFlags flags)
560 {
561 TU_FROM_HANDLE(tu_device, device, _device);
562 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
563 assert(firstQuery + queryCount <= pool->size);
564
565 if (vk_device_is_lost(&device->vk))
566 return VK_ERROR_DEVICE_LOST;
567
568 switch (pool->type) {
569 case VK_QUERY_TYPE_OCCLUSION:
570 case VK_QUERY_TYPE_TIMESTAMP:
571 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
572 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
573 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
574 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
575 return get_query_pool_results(device, pool, firstQuery, queryCount,
576 dataSize, pData, stride, flags);
577 default:
578 assert(!"Invalid query type");
579 }
580 return VK_SUCCESS;
581 }
582
583 /* Copies a query value from one buffer to another from the GPU. */
584 static void
585 copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
586 struct tu_cs *cs,
587 uint64_t src_iova,
588 uint64_t base_write_iova,
589 uint32_t offset,
590 VkQueryResultFlags flags) {
591 uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
592 sizeof(uint64_t) : sizeof(uint32_t);
593 uint64_t write_iova = base_write_iova + (offset * element_size);
594
595 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
596 uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
597 CP_MEM_TO_MEM_0_DOUBLE : 0;
598 tu_cs_emit(cs, mem_to_mem_flags);
599 tu_cs_emit_qw(cs, write_iova);
600 tu_cs_emit_qw(cs, src_iova);
601 }
602
603 static void
604 emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
605 struct tu_cs *cs,
606 struct tu_query_pool *pool,
607 uint32_t firstQuery,
608 uint32_t queryCount,
609 struct tu_buffer *buffer,
610 VkDeviceSize dstOffset,
611 VkDeviceSize stride,
612 VkQueryResultFlags flags)
613 {
614 /* From the Vulkan 1.1.130 spec:
615 *
616 * vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
617 * uses of vkCmdResetQueryPool in the same queue, without any additional
618 * synchronization.
619 *
620 * To ensure that previous writes to the available bit are coherent, first
621 * wait for all writes to complete.
622 */
623 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
624
625 for (uint32_t i = 0; i < queryCount; i++) {
626 uint32_t query = firstQuery + i;
627 uint64_t available_iova = query_available_iova(pool, query);
628 uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
629 uint32_t result_count = get_result_count(pool);
630 uint32_t statistics = pool->pipeline_statistics;
631
632 /* Wait for the available bit to be set if executed with the
633 * VK_QUERY_RESULT_WAIT_BIT flag. */
634 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
635 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
636 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
637 CP_WAIT_REG_MEM_0_POLL_MEMORY);
638 tu_cs_emit_qw(cs, available_iova);
639 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
640 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
641 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
642 }
643
644 for (uint32_t k = 0; k < result_count; k++) {
645 uint64_t result_iova;
646
647 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
648 uint32_t stat_idx = statistics_index(&statistics);
649 result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
650 } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
651 result_iova = query_result_iova(pool, query,
652 struct perfcntr_query_slot, k);
653 } else {
654 result_iova = query_result_iova(pool, query, uint64_t, k);
655 }
656
657 if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
658 /* Unconditionally copying the bo->result into the buffer here is
659 * valid because we only set bo->result on vkCmdEndQuery. Thus, even
660 * if the query is unavailable, this will copy the correct partial
661 * value of 0.
662 */
663 copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
664 k /* offset */, flags);
665 } else {
666 /* Conditionally copy bo->result into the buffer based on whether the
667 * query is available.
668 *
669 * NOTE: For the conditional packets to be executed, CP_COND_EXEC
670 * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
671 * that 0 < available < 2, aka available == 1.
672 */
673 tu_cs_reserve(cs, 7 + 6);
674 tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
675 tu_cs_emit_qw(cs, available_iova);
676 tu_cs_emit_qw(cs, available_iova);
677 tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
678 tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
679
680 /* Start of conditional execution */
681 copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
682 k /* offset */, flags);
683 /* End of conditional execution */
684 }
685 }
686
687 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
688 copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
689 result_count /* offset */, flags);
690 }
691 }
692 }
693
694 VKAPI_ATTR void VKAPI_CALL
695 tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
696 VkQueryPool queryPool,
697 uint32_t firstQuery,
698 uint32_t queryCount,
699 VkBuffer dstBuffer,
700 VkDeviceSize dstOffset,
701 VkDeviceSize stride,
702 VkQueryResultFlags flags)
703 {
704 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
705 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
706 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
707 struct tu_cs *cs = &cmdbuf->cs;
708 assert(firstQuery + queryCount <= pool->size);
709
710 switch (pool->type) {
711 case VK_QUERY_TYPE_OCCLUSION:
712 case VK_QUERY_TYPE_TIMESTAMP:
713 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
714 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
715 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
716 return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
717 queryCount, buffer, dstOffset, stride, flags);
718 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
719 unreachable("allowCommandBufferQueryCopies is false");
720 default:
721 assert(!"Invalid query type");
722 }
723 }
724
725 static void
726 emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
727 struct tu_query_pool *pool,
728 uint32_t firstQuery,
729 uint32_t queryCount)
730 {
731 struct tu_cs *cs = &cmdbuf->cs;
732
733 for (uint32_t i = 0; i < queryCount; i++) {
734 uint32_t query = firstQuery + i;
735 uint32_t statistics = pool->pipeline_statistics;
736
737 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
738 tu_cs_emit_qw(cs, query_available_iova(pool, query));
739 tu_cs_emit_qw(cs, 0x0);
740
741 for (uint32_t k = 0; k < get_result_count(pool); k++) {
742 uint64_t result_iova;
743
744 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
745 uint32_t stat_idx = statistics_index(&statistics);
746 result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
747 } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
748 result_iova = query_result_iova(pool, query,
749 struct perfcntr_query_slot, k);
750 } else {
751 result_iova = query_result_iova(pool, query, uint64_t, k);
752 }
753
754 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
755 tu_cs_emit_qw(cs, result_iova);
756 tu_cs_emit_qw(cs, 0x0);
757 }
758 }
759
760 }
761
762 VKAPI_ATTR void VKAPI_CALL
763 tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
764 VkQueryPool queryPool,
765 uint32_t firstQuery,
766 uint32_t queryCount)
767 {
768 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
769 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
770
771 switch (pool->type) {
772 case VK_QUERY_TYPE_TIMESTAMP:
773 case VK_QUERY_TYPE_OCCLUSION:
774 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
775 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
776 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
777 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
778 emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
779 break;
780 default:
781 assert(!"Invalid query type");
782 }
783 }
784
785 VKAPI_ATTR void VKAPI_CALL
786 tu_ResetQueryPool(VkDevice device,
787 VkQueryPool queryPool,
788 uint32_t firstQuery,
789 uint32_t queryCount)
790 {
791 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
792
793 for (uint32_t i = 0; i < queryCount; i++) {
794 struct query_slot *slot = slot_address(pool, i + firstQuery);
795 slot->available = 0;
796
797 for (uint32_t k = 0; k < get_result_count(pool); k++) {
798 uint64_t *res;
799
800 if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
801 res = query_result_addr(pool, i + firstQuery,
802 struct perfcntr_query_slot, k);
803 } else {
804 res = query_result_addr(pool, i + firstQuery, uint64_t, k);
805 }
806
807 *res = 0;
808 }
809 }
810 }
811
812 static void
813 emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
814 struct tu_query_pool *pool,
815 uint32_t query)
816 {
817 /* From the Vulkan 1.1.130 spec:
818 *
819 * A query must begin and end inside the same subpass of a render pass
820 * instance, or must both begin and end outside of a render pass
821 * instance.
822 *
823 * Unlike on an immediate-mode renderer, Turnip renders all tiles on
824 * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
825 * query begins/ends inside the same subpass of a render pass, we need to
826 * record the packets on the secondary draw command stream. cmdbuf->draw_cs
827 * is then run on every tile during render, so we just need to accumulate
828 * sample counts in slot->result to compute the query result.
829 */
830 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
831
832 uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
833
834 tu_cs_emit_regs(cs,
835 A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
836
837 tu_cs_emit_regs(cs,
838 A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
839
840 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
841 tu_cs_emit(cs, ZPASS_DONE);
842 }
843
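/* Begins a pipeline statistics query: starts the relevant HW counter groups
 * and snapshots RBBM_PRIMCTR_0..10 into slot->begin with CP_REG_TO_MEM. */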
844 static void
845 emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
846 struct tu_query_pool *pool,
847 uint32_t query)
848 {
849 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
850 uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);
851
852 if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
853 bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
854 cmdbuf->state.prim_counters_running++;
855
856 /* Prevent starting the primitive counters when they are supposed to be
857  * stopped for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
858 */
859 if (need_cond_exec) {
860 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
861 CP_COND_REG_EXEC_0_SYSMEM |
862 CP_COND_REG_EXEC_0_BINNING);
863 }
864
865 tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
866
867 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
868 tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
869 tu_cs_emit(cs, 0);
870
871 if (need_cond_exec) {
872 tu_cond_exec_end(cs);
873 }
874 }
875
876 if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
877 tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS);
878 }
879
880 if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
881 tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS);
882 }
883
884 tu_cs_emit_wfi(cs);
885
886 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
887 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
888 CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
889 CP_REG_TO_MEM_0_64B);
890 tu_cs_emit_qw(cs, begin_iova);
891 }
892
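/* Emits a CP_REG_TEST + CP_COND_REG_EXEC pair so that the following packets
 * only execute when bit `pass` is set in the perf-counter pass scratch
 * register, which is set per submit (see the comment in emit_begin_perf_query). */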
893 static void
894 emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
895 {
896 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
897 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
898 REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
899 A6XX_CP_REG_TEST_0_BIT(pass) |
900 A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
901 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
902 }
903
904 static void
905 emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
906 struct tu_query_pool *pool,
907 uint32_t query)
908 {
909 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
910 uint32_t last_pass = ~0;
911
912 if (cmdbuf->state.pass) {
913 cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
914 }
915
916 /* Querying perf counters happens in these steps:
917 *
918 * 0) There's a scratch reg to set a pass index for perf counters query.
919 * Prepare cmd streams to set each pass index to the reg at device
920 * creation time. See tu_CreateDevice in tu_device.c
921 * 1) Emit command streams to read all requested perf counters at all
922 * passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
923 * reads the scratch reg where pass index is set.
924 * See emit_perfcntrs_pass_start.
925 * 2) Pick the right cs setting proper pass index to the reg and prepend
926 * it to the command buffer at each submit time.
927 * See tu_QueueSubmit in tu_drm.c
928  * 3) If the pass bit in the reg is set, the command stream under
929  *    CP_COND_REG_EXEC is executed.
930 */
931
932 tu_cs_emit_wfi(cs);
933
934 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
935 struct tu_perf_query_data *data = &pool->perf_query_data[i];
936
937 if (last_pass != data->pass) {
938 last_pass = data->pass;
939
940 if (data->pass != 0)
941 tu_cond_exec_end(cs);
942 emit_perfcntrs_pass_start(cs, data->pass);
943 }
944
945 const struct fd_perfcntr_counter *counter =
946 &pool->perf_group[data->gid].counters[data->cntr_reg];
947 const struct fd_perfcntr_countable *countable =
948 &pool->perf_group[data->gid].countables[data->cid];
949
950 tu_cs_emit_pkt4(cs, counter->select_reg, 1);
951 tu_cs_emit(cs, countable->selector);
952 }
953 tu_cond_exec_end(cs);
954
955 last_pass = ~0;
956 tu_cs_emit_wfi(cs);
957
958 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
959 struct tu_perf_query_data *data = &pool->perf_query_data[i];
960
961 if (last_pass != data->pass) {
962 last_pass = data->pass;
963
964 if (data->pass != 0)
965 tu_cond_exec_end(cs);
966 emit_perfcntrs_pass_start(cs, data->pass);
967 }
968
969 const struct fd_perfcntr_counter *counter =
970 &pool->perf_group[data->gid].counters[data->cntr_reg];
971
972 uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
973
974 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
975 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
976 CP_REG_TO_MEM_0_64B);
977 tu_cs_emit_qw(cs, begin_iova);
978 }
979 tu_cond_exec_end(cs);
980 }
981
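/* Begins a transform feedback query: points VPC_SO_STREAM_COUNTS at
 * slot->begin and triggers WRITE_PRIMITIVE_COUNTS so the HW dumps the current
 * per-stream primitive counters there. */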
982 static void
983 emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
984 struct tu_query_pool *pool,
985 uint32_t query,
986 uint32_t stream_id)
987 {
988 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
989 uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);
990
991 tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
992 tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
993 }
994
995 static void
996 emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
997 struct tu_query_pool *pool,
998 uint32_t query)
999 {
1000 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1001 uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1002
1003 if (cmdbuf->state.pass) {
1004 cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
1005 } else {
1006 cmdbuf->state.prim_generated_query_running_before_rp = true;
1007 }
1008
1009 cmdbuf->state.prim_counters_running++;
1010
1011 if (cmdbuf->state.pass) {
1012 /* Primitives that passed all tests would otherwise be counted again in
1013  * each tile even with HW binning beforehand. Do not permit that.
1014 */
1015 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1016 CP_COND_REG_EXEC_0_SYSMEM |
1017 CP_COND_REG_EXEC_0_BINNING);
1018 }
1019
1020 tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
1021
1022 tu_cs_emit_wfi(cs);
1023
1024 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1025 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1026 CP_REG_TO_MEM_0_CNT(2) |
1027 CP_REG_TO_MEM_0_64B);
1028 tu_cs_emit_qw(cs, begin_iova);
1029
1030 if (cmdbuf->state.pass) {
1031 tu_cond_exec_end(cs);
1032 }
1033 }
1034
1035 VKAPI_ATTR void VKAPI_CALL
1036 tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
1037 VkQueryPool queryPool,
1038 uint32_t query,
1039 VkQueryControlFlags flags)
1040 {
1041 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1042 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1043 assert(query < pool->size);
1044
1045 switch (pool->type) {
1046 case VK_QUERY_TYPE_OCCLUSION:
1047 /* In freedreno, there is no implementation difference between
1048 * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
1049 * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
1050 */
1051 emit_begin_occlusion_query(cmdbuf, pool, query);
1052 break;
1053 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1054 emit_begin_xfb_query(cmdbuf, pool, query, 0);
1055 break;
1056 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1057 emit_begin_prim_generated_query(cmdbuf, pool, query);
1058 break;
1059 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1060 emit_begin_perf_query(cmdbuf, pool, query);
1061 break;
1062 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1063 emit_begin_stat_query(cmdbuf, pool, query);
1064 break;
1065 case VK_QUERY_TYPE_TIMESTAMP:
1066 unreachable("Unimplemented query type");
1067 default:
1068 assert(!"Invalid query type");
1069 }
1070 }
1071
1072 VKAPI_ATTR void VKAPI_CALL
1073 tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
1074 VkQueryPool queryPool,
1075 uint32_t query,
1076 VkQueryControlFlags flags,
1077 uint32_t index)
1078 {
1079 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1080 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1081 assert(query < pool->size);
1082
1083 switch (pool->type) {
1084 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1085 emit_begin_xfb_query(cmdbuf, pool, query, index);
1086 break;
1087 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1088 emit_begin_prim_generated_query(cmdbuf, pool, query);
1089 break;
1090 default:
1091 assert(!"Invalid query type");
1092 }
1093 }
1094
1095 static void
1096 emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
1097 struct tu_query_pool *pool,
1098 uint32_t query)
1099 {
1100 /* Ending an occlusion query happens in a few steps:
1101 * 1) Set the slot->end to UINT64_MAX.
1102 * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1103 * write the current sample count value into slot->end.
1104 * 3) Since (2) is asynchronous, wait until slot->end is not equal to
1105 * UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1106 * 4) Accumulate the results of the query (slot->end - slot->begin) into
1107 * slot->result.
1108 * 5) If vkCmdEndQuery is *not* called from within the scope of a render
1109 * pass, set the slot's available bit since the query is now done.
1110 * 6) If vkCmdEndQuery *is* called from within the scope of a render
1111 * pass, we cannot mark as available yet since the commands in
1112 * draw_cs are not run until vkCmdEndRenderPass.
1113 */
1114 const struct tu_render_pass *pass = cmdbuf->state.pass;
1115 struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1116
1117 uint64_t available_iova = query_available_iova(pool, query);
1118 uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1119 uint64_t end_iova = occlusion_query_iova(pool, query, end);
1120 uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
1121 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1122 tu_cs_emit_qw(cs, end_iova);
1123 tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1124
1125 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1126
1127 tu_cs_emit_regs(cs,
1128 A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1129
1130 tu_cs_emit_regs(cs,
1131 A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1132
1133 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1134 tu_cs_emit(cs, ZPASS_DONE);
1135
1136 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1137 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1138 CP_WAIT_REG_MEM_0_POLL_MEMORY);
1139 tu_cs_emit_qw(cs, end_iova);
1140 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1141 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1142 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1143
1144 /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1145 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1146 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1147 tu_cs_emit_qw(cs, result_iova);
1148 tu_cs_emit_qw(cs, result_iova);
1149 tu_cs_emit_qw(cs, end_iova);
1150 tu_cs_emit_qw(cs, begin_iova);
1151
1152 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1153
1154 if (pass)
1155 /* Technically, queries should be tracked per-subpass, but here we track
1156  * at the render pass level to simplify the code a bit. This is safe
1157 * because the only commands that use the available bit are
1158 * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1159 * cannot be invoked from inside a render pass scope.
1160 */
1161 cs = &cmdbuf->draw_epilogue_cs;
1162
1163 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1164 tu_cs_emit_qw(cs, available_iova);
1165 tu_cs_emit_qw(cs, 0x1);
1166 }
1167
1168 /* PRIMITIVE_CTRS is used for two distinct queries:
1169 * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
1170 * - VK_QUERY_TYPE_PIPELINE_STATISTICS
1171  * If one is nested inside the other, STOP_PRIMITIVE_CTRS should be emitted
1172  * only for the outer query.
1173  *
1174  * Also, a pipeline stats query could run outside of a render pass while a
1175  * prim gen query runs inside a secondary cmd buffer - for that case we have
1176  * to track the status of the pipeline stats query.
1177 */
1178 static void
1179 emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
1180 struct tu_cs *cs,
1181 enum VkQueryType query_type)
1182 {
1183 bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
1184 cmdbuf->state.prim_counters_running--;
1185 if (cmdbuf->state.prim_counters_running == 0) {
1186 bool need_cond_exec =
1187 is_secondary &&
1188 query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
1189 is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
1190
1191 if (!need_cond_exec) {
1192 tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
1193 } else {
1194 tu_cs_reserve(cs, 7 + 2);
1195 /* Check that the pipeline stats query is not running; only then
1196  * do we stop the counters.
1197 */
1198 tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
1199 tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1200 tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1201 tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
1202 tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
1203
1204 tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
1205 }
1206 }
1207
1208 if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1209 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1210 tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1211 tu_cs_emit(cs, 1);
1212 }
1213 }
1214
1215 static void
1216 emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1217 struct tu_query_pool *pool,
1218 uint32_t query)
1219 {
1220 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1221 uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
1222 uint64_t available_iova = query_available_iova(pool, query);
1223 uint64_t result_iova;
1224 uint64_t stat_start_iova;
1225 uint64_t stat_stop_iova;
1226
1227 if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
1228 /* No need to conditionally execute STOP_PRIMITIVE_CTRS when we are
1229  * inside a VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query within a
1230  * renderpass, because it is already stopped.
1231 */
1232 emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
1233 }
1234
1235 if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
1236 tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS);
1237 }
1238
1239 if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
1240 tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS);
1241 }
1242
1243 tu_cs_emit_wfi(cs);
1244
1245 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1246 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1247 CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1248 CP_REG_TO_MEM_0_64B);
1249 tu_cs_emit_qw(cs, end_iova);
1250
1251 for (int i = 0; i < STAT_COUNT; i++) {
1252 result_iova = query_result_iova(pool, query, uint64_t, i);
1253 stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
1254 stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);
1255
1256 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1257 tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1258 CP_MEM_TO_MEM_0_DOUBLE |
1259 CP_MEM_TO_MEM_0_NEG_C);
1260
1261 tu_cs_emit_qw(cs, result_iova);
1262 tu_cs_emit_qw(cs, result_iova);
1263 tu_cs_emit_qw(cs, stat_stop_iova);
1264 tu_cs_emit_qw(cs, stat_start_iova);
1265 }
1266
1267 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1268
1269 if (cmdbuf->state.pass)
1270 cs = &cmdbuf->draw_epilogue_cs;
1271
1272 /* Set the availability to 1 */
1273 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1274 tu_cs_emit_qw(cs, available_iova);
1275 tu_cs_emit_qw(cs, 0x1);
1276 }
1277
1278 static void
1279 emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1280 struct tu_query_pool *pool,
1281 uint32_t query)
1282 {
1283 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1284 uint64_t available_iova = query_available_iova(pool, query);
1285 uint64_t end_iova;
1286 uint64_t begin_iova;
1287 uint64_t result_iova;
1288 uint32_t last_pass = ~0;
1289
1290 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1291 struct tu_perf_query_data *data = &pool->perf_query_data[i];
1292
1293 if (last_pass != data->pass) {
1294 last_pass = data->pass;
1295
1296 if (data->pass != 0)
1297 tu_cond_exec_end(cs);
1298 emit_perfcntrs_pass_start(cs, data->pass);
1299 }
1300
1301 const struct fd_perfcntr_counter *counter =
1302 &pool->perf_group[data->gid].counters[data->cntr_reg];
1303
1304 end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1305
1306 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1307 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1308 CP_REG_TO_MEM_0_64B);
1309 tu_cs_emit_qw(cs, end_iova);
1310 }
1311 tu_cond_exec_end(cs);
1312
1313 last_pass = ~0;
1314 tu_cs_emit_wfi(cs);
1315
1316 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1317 struct tu_perf_query_data *data = &pool->perf_query_data[i];
1318
1319 if (last_pass != data->pass) {
1320 last_pass = data->pass;
1321
1322
1323 if (data->pass != 0)
1324 tu_cond_exec_end(cs);
1325 emit_perfcntrs_pass_start(cs, data->pass);
1326 }
1327
1328 result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1329 data->app_idx);
1330 begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1331 end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1332
1333 /* result += end - begin */
1334 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1335 tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1336 CP_MEM_TO_MEM_0_DOUBLE |
1337 CP_MEM_TO_MEM_0_NEG_C);
1338
1339 tu_cs_emit_qw(cs, result_iova);
1340 tu_cs_emit_qw(cs, result_iova);
1341 tu_cs_emit_qw(cs, end_iova);
1342 tu_cs_emit_qw(cs, begin_iova);
1343 }
1344 tu_cond_exec_end(cs);
1345
1346 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1347
1348 if (cmdbuf->state.pass)
1349 cs = &cmdbuf->draw_epilogue_cs;
1350
1351 /* Set the availability to 1 */
1352 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1353 tu_cs_emit_qw(cs, available_iova);
1354 tu_cs_emit_qw(cs, 0x1);
1355 }
1356
1357 static void
1358 emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1359 struct tu_query_pool *pool,
1360 uint32_t query,
1361 uint32_t stream_id)
1362 {
1363 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1364
1365 uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
1366 uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1367 uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1368 uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
1369 uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
1370 uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
1371 uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
1372 uint64_t available_iova = query_available_iova(pool, query);
1373
1374 tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1375 tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
1376
1377 tu_cs_emit_wfi(cs);
1378 tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
1379
1380 /* Set the count of written primitives */
1381 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1382 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1383 CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1384 tu_cs_emit_qw(cs, result_written_iova);
1385 tu_cs_emit_qw(cs, result_written_iova);
1386 tu_cs_emit_qw(cs, end_written_iova);
1387 tu_cs_emit_qw(cs, begin_written_iova);
1388
1389 tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
1390
1391 /* Set the count of generated primitives */
1392 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1393 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1394 CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1395 tu_cs_emit_qw(cs, result_generated_iova);
1396 tu_cs_emit_qw(cs, result_generated_iova);
1397 tu_cs_emit_qw(cs, end_generated_iova);
1398 tu_cs_emit_qw(cs, begin_generated_iova);
1399
1400 /* Set the availability to 1 */
1401 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1402 tu_cs_emit_qw(cs, available_iova);
1403 tu_cs_emit_qw(cs, 0x1);
1404 }
1405
1406 static void
1407 emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1408 struct tu_query_pool *pool,
1409 uint32_t query)
1410 {
1411 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1412
1413 if (!cmdbuf->state.pass) {
1414 cmdbuf->state.prim_generated_query_running_before_rp = false;
1415 }
1416
1417 uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1418 uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
1419 uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
1420 uint64_t available_iova = query_available_iova(pool, query);
1421
1422 if (cmdbuf->state.pass) {
1423 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1424 CP_COND_REG_EXEC_0_SYSMEM |
1425 CP_COND_REG_EXEC_0_BINNING);
1426 }
1427
1428 tu_cs_emit_wfi(cs);
1429
1430 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1431 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1432 CP_REG_TO_MEM_0_CNT(2) |
1433 CP_REG_TO_MEM_0_64B);
1434 tu_cs_emit_qw(cs, end_iova);
1435
1436 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1437 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1438 CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
1439 tu_cs_emit_qw(cs, result_iova);
1440 tu_cs_emit_qw(cs, result_iova);
1441 tu_cs_emit_qw(cs, end_iova);
1442 tu_cs_emit_qw(cs, begin_iova);
1443
1444 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1445
1446 /* This must come after waiting for the mem writes so we have up-to-date
1447  * info about which query is running.
1448 */
1449 emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1450
1451 if (cmdbuf->state.pass) {
1452 tu_cond_exec_end(cs);
1453 }
1454
1455 if (cmdbuf->state.pass)
1456 cs = &cmdbuf->draw_epilogue_cs;
1457
1458 /* Set the availability to 1 */
1459 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1460 tu_cs_emit_qw(cs, available_iova);
1461 tu_cs_emit_qw(cs, 0x1);
1462 }
1463
1464 /* Implement this bit of spec text from section 17.2 "Query Operation":
1465 *
1466 * If queries are used while executing a render pass instance that has
1467 * multiview enabled, the query uses N consecutive query indices in the
1468 * query pool (starting at query) where N is the number of bits set in the
1469 * view mask in the subpass the query is used in. How the numerical
1470 * results of the query are distributed among the queries is
1471 * implementation-dependent. For example, some implementations may write
1472 * each view’s results to a distinct query, while other implementations
1473 * may write the total result to the first query and write zero to the
1474 * other queries. However, the sum of the results in all the queries must
1475 * accurately reflect the total result of the query summed over all views.
1476 * Applications can sum the results from all the queries to compute the
1477 * total result.
1478 *
1479 * Since we execute all views at once, we write zero to the other queries.
1480 * Furthermore, because queries must be reset before use, and we set the
1481 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
1482 */
1483
1484 static void
1485 handle_multiview_queries(struct tu_cmd_buffer *cmd,
1486 struct tu_query_pool *pool,
1487 uint32_t query)
1488 {
1489 if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
1490 return;
1491
1492 unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
1493 struct tu_cs *cs = &cmd->draw_epilogue_cs;
1494
1495 for (uint32_t i = 1; i < views; i++) {
1496 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1497 tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
1498 tu_cs_emit_qw(cs, 0x1);
1499 }
1500 }
1501
1502 VKAPI_ATTR void VKAPI_CALL
1503 tu_CmdEndQuery(VkCommandBuffer commandBuffer,
1504 VkQueryPool queryPool,
1505 uint32_t query)
1506 {
1507 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1508 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1509 assert(query < pool->size);
1510
1511 switch (pool->type) {
1512 case VK_QUERY_TYPE_OCCLUSION:
1513 emit_end_occlusion_query(cmdbuf, pool, query);
1514 break;
1515 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1516 emit_end_xfb_query(cmdbuf, pool, query, 0);
1517 break;
1518 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1519 emit_end_prim_generated_query(cmdbuf, pool, query);
1520 break;
1521 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1522 emit_end_perf_query(cmdbuf, pool, query);
1523 break;
1524 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1525 emit_end_stat_query(cmdbuf, pool, query);
1526 break;
1527 case VK_QUERY_TYPE_TIMESTAMP:
1528 unreachable("Unimplemented query type");
1529 default:
1530 assert(!"Invalid query type");
1531 }
1532
1533 handle_multiview_queries(cmdbuf, pool, query);
1534 }
1535
1536 VKAPI_ATTR void VKAPI_CALL
1537 tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
1538 VkQueryPool queryPool,
1539 uint32_t query,
1540 uint32_t index)
1541 {
1542 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1543 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1544 assert(query < pool->size);
1545
1546 switch (pool->type) {
1547 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1548 assert(index < 4);
1549 emit_end_xfb_query(cmdbuf, pool, query, index);
1550 break;
1551 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1552 emit_end_prim_generated_query(cmdbuf, pool, query);
1553 break;
1554 default:
1555 assert(!"Invalid query type");
1556 }
1557 }
1558
1559 VKAPI_ATTR void VKAPI_CALL
1560 tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
1561 VkPipelineStageFlagBits2 pipelineStage,
1562 VkQueryPool queryPool,
1563 uint32_t query)
1564 {
1565 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1566 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1567
1568 /* Inside a render pass, just write the timestamp multiple times so that
1569 * the user gets the last one if we use GMEM. There isn't really much
1570 * better we can do, and this seems to be what the blob does too.
1571 */
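/* Note (our understanding of the GMEM path, not verified against docs): in
 * tiled/GMEM mode the draw_cs is replayed once per tile, so the CP_REG_TO_MEM
 * below runs once per tile and the value written by the last replay is what
 * survives in the result slot, which is why repeated writes are acceptable.
 */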
1572 struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
1573
1574 /* Stages that will already have been executed by the time the CP executes
1575 * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
1576 * indirect stage counts as top-of-pipe too.
1577 */
1578 VkPipelineStageFlags2 top_of_pipe_flags =
1579 VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
1580 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
1581
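/* For example, a timestamp requested for VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT
 * (or any stage outside top_of_pipe_flags) takes the WFI path below, whereas
 * VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT alone lets us capture the counter
 * without stalling.
 */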
1582 if (pipelineStage & ~top_of_pipe_flags) {
1583 /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
1584 * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
1585 * complete.
1586 *
1587 * Stalling the CP like this is really unfortunate, but I don't think
1588 * there's a better solution that allows all 48 bits of precision
1589 * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
1590 */
1591 tu_cs_emit_wfi(cs);
1592 }
1593
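/* Presumably CNT(2) together with the 64B flag copies both 32-bit halves of
 * the always-on counter into the query's 64-bit result slot; this is an
 * assumption read off the packet parameters rather than documented behavior.
 */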
1594 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1595 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
1596 CP_REG_TO_MEM_0_CNT(2) |
1597 CP_REG_TO_MEM_0_64B);
1598 tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
1599
1600 /* Only flag availability once the entire renderpass is done, similar to
1601 * the begin/end path.
1602 */
1603 cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
1604
1605 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1606 tu_cs_emit_qw(cs, query_available_iova(pool, query));
1607 tu_cs_emit_qw(cs, 0x1);
1608
1609 /* From the spec for vkCmdWriteTimestamp:
1610 *
1611 * If vkCmdWriteTimestamp is called while executing a render pass
1612 * instance that has multiview enabled, the timestamp uses N consecutive
1613 * query indices in the query pool (starting at query) where N is the
1614 * number of bits set in the view mask of the subpass the command is
1615 * executed in. The resulting query values are determined by an
1616 * implementation-dependent choice of one of the following behaviors:
1617 *
1618 * - The first query is a timestamp value and (if more than one bit is
1619 * set in the view mask) zero is written to the remaining queries.
1620 * If two timestamps are written in the same subpass, the sum of the
1621 * execution time of all views between those commands is the
1622 * difference between the first query written by each command.
1623 *
1624 * - All N queries are timestamp values. If two timestamps are written
1625 * in the same subpass, the sum of the execution time of all views
1626 * between those commands is the sum of the difference between
1627 * corresponding queries written by each command. The difference
1628 * between corresponding queries may be the execution time of a
1629 * single view.
1630 *
1631 * We execute all views in the same draw call, so we implement the first
1632 * option, the same as regular queries.
1633 */
1634 handle_multiview_queries(cmd, pool, query);
1635 }
1636
1637 VKAPI_ATTR VkResult VKAPI_CALL
1638 tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
1639 VkPhysicalDevice physicalDevice,
1640 uint32_t queueFamilyIndex,
1641 uint32_t* pCounterCount,
1642 VkPerformanceCounterKHR* pCounters,
1643 VkPerformanceCounterDescriptionKHR* pCounterDescriptions)
1644 {
1645 TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1646
1647 uint32_t desc_count = *pCounterCount;
1648 uint32_t group_count;
1649 const struct fd_perfcntr_group *group =
1650 fd_perfcntrs(&phydev->dev_id, &group_count);
1651
1652 VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
1653 VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
1654 pCounterDescriptions, &desc_count);
1655
1656 for (int i = 0; i < group_count; i++) {
1657 for (int j = 0; j < group[i].num_countables; j++) {
1658
1659 vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
1660 counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
1661 counter->unit =
1662 fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
1663 counter->storage =
1664 fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
1665
1666 unsigned char sha1_result[20];
1667 _mesa_sha1_compute(group[i].countables[j].name,
1668 strlen(group[i].countables[j].name),
1669 sha1_result);
1670 memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
1671 }
1672
1673 vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
1674 desc->flags = 0;
1675
1676 snprintf(desc->name, sizeof(desc->name),
1677 "%s", group[i].countables[j].name);
1678 snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
1679 snprintf(desc->description, sizeof(desc->description),
1680 "%s: %s performance counter",
1681 group[i].name, group[i].countables[j].name);
1682 }
1683 }
1684 }
1685
1686 return vk_outarray_status(&out);
1687 }
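/* A minimal usage sketch (hypothetical application code, not part of the
 * driver): the usual Vulkan two-call pattern, first querying the count and
 * then filling both output arrays in a second call.
 *
 *    uint32_t count = 0;
 *    vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
 *       physical_device, queue_family, &count, NULL, NULL);
 *    VkPerformanceCounterKHR *counters = calloc(count, sizeof(*counters));
 *    VkPerformanceCounterDescriptionKHR *descs = calloc(count, sizeof(*descs));
 *    for (uint32_t i = 0; i < count; i++) {
 *       counters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
 *       descs[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
 *    }
 *    vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
 *       physical_device, queue_family, &count, counters, descs);
 */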
1688
1689 VKAPI_ATTR void VKAPI_CALL
1690 tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
1691 VkPhysicalDevice physicalDevice,
1692 const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo,
1693 uint32_t* pNumPasses)
1694 {
1695 TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
1696 uint32_t group_count = 0;
1697 uint32_t gid = 0, cid = 0, n_passes;
1698 const struct fd_perfcntr_group *group =
1699 fd_perfcntrs(&phydev->dev_id, &group_count);
1700
1701 uint32_t counters_requested[group_count];
1702 memset(counters_requested, 0x0, sizeof(counters_requested));
1703 *pNumPasses = 1;
1704
1705 for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
1706 perfcntr_index(group, group_count,
1707 pPerformanceQueryCreateInfo->pCounterIndices[i],
1708 &gid, &cid);
1709
1710 counters_requested[gid]++;
1711 }
1712
1713 for (uint32_t i = 0; i < group_count; i++) {
1714 n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
1715 *pNumPasses = MAX2(*pNumPasses, n_passes);
1716 }
1717 }
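/* Worked example (illustrative numbers): if a counter group exposes
 * num_counters == 4 hardware counters and the create info selects 6
 * countables from that group, DIV_ROUND_UP(6, 4) == 2, so *pNumPasses becomes
 * 2; groups whose requests fit in a single pass leave the maximum at 1.
 */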
1718
1719 VKAPI_ATTR VkResult VKAPI_CALL
1720 tu_AcquireProfilingLockKHR(VkDevice device,
1721 const VkAcquireProfilingLockInfoKHR* pInfo)
1722 {
1723 /* TODO. Probably there's something to do for kgsl. */
1724 return VK_SUCCESS;
1725 }
1726
1727 VKAPI_ATTR void VKAPI_CALL
1728 tu_ReleaseProfilingLockKHR(VkDevice device)
1729 {
1730 /* TODO. Probably there's something to do for kgsl. */
1731 return;
1732 }
1733