1 /*
2  * Copyright 2016 Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 *
5 * Based on anv:
6 * Copyright © 2015 Intel Corporation
7 */
8
9 #include "tu_query.h"
10
11 #include <fcntl.h>
12
13 #include "nir/nir_builder.h"
14 #include "util/os_time.h"
15
16 #include "vk_util.h"
17
18 #include "tu_cmd_buffer.h"
19 #include "tu_cs.h"
20 #include "tu_device.h"
21
22 #include "common/freedreno_gpu_event.h"
23
24 #define NSEC_PER_SEC 1000000000ull
25 #define WAIT_TIMEOUT 5
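/* Number of pipeline statistics counters captured per query: RBBM_PRIMCTR_0
 * through RBBM_PRIMCTR_10, where each counter is a LO/HI register pair
 * (hence the division by 2). */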
26 #define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
27
28 struct PACKED query_slot {
29 uint64_t available;
30 };
31
32 struct PACKED occlusion_slot_value {
33    /* Sample counters appear to be placed at 16-byte alignment
34     * even though this query only needs an 8-byte slot. */
35 uint64_t value;
36 uint64_t _padding;
37 };
38
39 struct PACKED occlusion_query_slot {
40 struct query_slot common;
41 uint64_t result;
42
43 struct occlusion_slot_value begin;
44 struct occlusion_slot_value end;
45 };
46
47 struct PACKED timestamp_query_slot {
48 struct query_slot common;
49 uint64_t result;
50 };
51
52 struct PACKED primitive_slot_value {
53 uint64_t values[2];
54 };
55
56 struct PACKED pipeline_stat_query_slot {
57 struct query_slot common;
58 uint64_t results[STAT_COUNT];
59
60 uint64_t begin[STAT_COUNT];
61 uint64_t end[STAT_COUNT];
62 };
63
64 struct PACKED primitive_query_slot {
65 struct query_slot common;
66 /* The result of transform feedback queries is two integer values:
67 * results[0] is the count of primitives written,
68 * results[1] is the count of primitives generated.
69     * Also, counters for each of the 4 streams are stored in their respective slots.
70 */
71 uint64_t results[2];
72
73 /* Primitive counters also need to be 16-byte aligned. */
74 uint64_t _padding;
75
76 struct primitive_slot_value begin[4];
77 struct primitive_slot_value end[4];
78 };
79
80 struct PACKED perfcntr_query_slot {
81 uint64_t result;
82 uint64_t begin;
83 uint64_t end;
84 };
85
86 struct PACKED perf_query_slot {
87 struct query_slot common;
88 struct perfcntr_query_slot perfcntr;
89 };
90
91 struct PACKED primitives_generated_query_slot {
92 struct query_slot common;
93 uint64_t result;
94 uint64_t begin;
95 uint64_t end;
96 };
97
98 /* Returns the IOVA of a given uint64_t field in a given slot of a query
99 * pool. */
100 #define query_iova(type, pool, query, field) \
101 pool->bo->iova + pool->stride * (query) + offsetof(type, field)
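/* e.g. query_iova(struct occlusion_query_slot, pool, 2, end) yields the IOVA
 * of the 'end' sample count of slot 2. */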
102
103 #define occlusion_query_iova(pool, query, field) \
104 query_iova(struct occlusion_query_slot, pool, query, field)
105
106 #define pipeline_stat_query_iova(pool, query, field, idx) \
107 pool->bo->iova + pool->stride * (query) + \
108 offsetof_arr(struct pipeline_stat_query_slot, field, (idx))
109
110 #define primitive_query_iova(pool, query, field, stream_id, i) \
111 query_iova(struct primitive_query_slot, pool, query, field) + \
112 sizeof_field(struct primitive_query_slot, field[0]) * (stream_id) + \
113 offsetof_arr(struct primitive_slot_value, values, (i))
114
115 #define perf_query_iova(pool, query, field, i) \
116 pool->bo->iova + pool->stride * (query) + \
117 sizeof(struct query_slot) + \
118 sizeof(struct perfcntr_query_slot) * (i) + \
119 offsetof(struct perfcntr_query_slot, field)
120
121 #define primitives_generated_query_iova(pool, query, field) \
122 query_iova(struct primitives_generated_query_slot, pool, query, field)
123
124 #define query_available_iova(pool, query) \
125 query_iova(struct query_slot, pool, query, available)
126
127 #define query_result_iova(pool, query, type, i) \
128 pool->bo->iova + pool->stride * (query) + \
129 sizeof(struct query_slot) + sizeof(type) * (i)
130
131 #define query_result_addr(pool, query, type, i) \
132 (uint64_t *) ((char *) pool->bo->map + pool->stride * (query) + \
133 sizeof(struct query_slot) + sizeof(type) * (i))
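/* query_result_iova/query_result_addr rely on every slot type storing its
 * result(s) immediately after the common 'available' header, which holds for
 * all of the slot structs above. */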
134
135 #define query_is_available(slot) slot->available
136
137 static const VkPerformanceCounterUnitKHR
138 fd_perfcntr_type_to_vk_unit[] = {
139 [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
140 [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
141 [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
142 [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
143 [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
144    /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert to nanoseconds */
145 [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
146 [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
147 [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
148 [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
149 [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
150 [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
151 [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
152 };
153
154 /* TODO: This basically comes from the freedreno implementation, where
155  * only UINT64 is used. We should confirm it against the blob Vulkan driver
156  * once it starts supporting perf queries.
157 */
158 static const VkPerformanceCounterStorageKHR
159 fd_perfcntr_type_to_vk_storage[] = {
160 [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
161 [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
162 [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
163 [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
164 [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
165 [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
166 [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
167 [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
168 [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
169 [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
170 [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
171 [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
172 };
173
174 /*
175 * Returns a pointer to a given slot in a query pool.
176 */
177 static struct query_slot *
178 slot_address(struct tu_query_pool *pool, uint32_t query)
179 {
180 return (struct query_slot *) ((char *) pool->bo->map +
181 query * pool->stride);
182 }
183
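/* Maps a flat counter index (as passed in pCounterIndices by the application)
 * to a (group id, countable id) pair by walking the per-group countable
 * counts. */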
184 static void
185 perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
186 uint32_t index, uint32_t *gid, uint32_t *cid)
187
188 {
189 uint32_t i;
190
191 for (i = 0; i < group_count; i++) {
192 if (group[i].num_countables > index) {
193 *gid = i;
194 *cid = index;
195 break;
196 }
197 index -= group[i].num_countables;
198 }
199
200 assert(i < group_count);
201 }
202
203 static int
204 compare_perfcntr_pass(const void *a, const void *b)
205 {
206 return ((struct tu_perf_query_data *)a)->pass -
207 ((struct tu_perf_query_data *)b)->pass;
208 }
209
210 VKAPI_ATTR VkResult VKAPI_CALL
211 tu_CreateQueryPool(VkDevice _device,
212 const VkQueryPoolCreateInfo *pCreateInfo,
213 const VkAllocationCallbacks *pAllocator,
214 VkQueryPool *pQueryPool)
215 {
216 TU_FROM_HANDLE(tu_device, device, _device);
217 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
218 assert(pCreateInfo->queryCount > 0);
219
220 uint32_t pool_size, slot_size;
221 const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
222
223 pool_size = sizeof(struct tu_query_pool);
224
225 switch (pCreateInfo->queryType) {
226 case VK_QUERY_TYPE_OCCLUSION:
227 slot_size = sizeof(struct occlusion_query_slot);
228 break;
229 case VK_QUERY_TYPE_TIMESTAMP:
230 slot_size = sizeof(struct timestamp_query_slot);
231 break;
232 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
233 slot_size = sizeof(struct primitive_query_slot);
234 break;
235 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
236 slot_size = sizeof(struct primitives_generated_query_slot);
237 break;
238 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
239 perf_query_info =
240 vk_find_struct_const(pCreateInfo->pNext,
241 QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
242 assert(perf_query_info);
243
244 slot_size = sizeof(struct perf_query_slot) +
245 sizeof(struct perfcntr_query_slot) *
246 (perf_query_info->counterIndexCount - 1);
247
248       /* Size of the array pool->perf_query_data */
249 pool_size += sizeof(struct tu_perf_query_data) *
250 perf_query_info->counterIndexCount;
251 break;
252 }
253 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
254 slot_size = sizeof(struct pipeline_stat_query_slot);
255 break;
256 default:
257 unreachable("Invalid query type");
258 }
259
260 struct tu_query_pool *pool = (struct tu_query_pool *)
261 vk_object_alloc(&device->vk, pAllocator, pool_size,
262 VK_OBJECT_TYPE_QUERY_POOL);
263 if (!pool)
264 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
265
266 if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
267 pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
268 &pool->perf_group_count);
269
270 pool->counter_index_count = perf_query_info->counterIndexCount;
271
272       /* Build the data for every requested perf counter, so we can derive
273        * the correct group id, countable id, counter register and pass index
274        * from just the counter index the application provides at each submit.
275        *
276        * Also, since this data is later sorted by pass index, keep the original
277        * application indices and store perfcntr results according to them so
278        * apps get correct results with their own indices.
279 */
280 uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
281 memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
282 memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
283
284 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
285 uint32_t gid = 0, cid = 0;
286
287 perfcntr_index(pool->perf_group, pool->perf_group_count,
288 perf_query_info->pCounterIndices[i], &gid, &cid);
289
290 pool->perf_query_data[i].gid = gid;
291 pool->perf_query_data[i].cid = cid;
292 pool->perf_query_data[i].app_idx = i;
293
294          /* When a group's counter registers are exhausted (num_counters),
295           * start a new pass and begin again from register 0.
296           */
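         /* e.g. with num_counters == 4 in a group, six requested countables
          * from that group land on cntr_reg 0..3 in pass 0 and cntr_reg 0..1
          * in pass 1. */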
297 if (regs[gid] < pool->perf_group[gid].num_counters) {
298 pool->perf_query_data[i].cntr_reg = regs[gid]++;
299 pool->perf_query_data[i].pass = pass[gid];
300 } else {
301 pool->perf_query_data[i].pass = ++pass[gid];
302 pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
303 regs[gid]++;
304 }
305 }
306
307       /* Sort by pass index so we can easily prepare a command stream in
308        * ascending pass order.
309 */
310 qsort(pool->perf_query_data, pool->counter_index_count,
311 sizeof(pool->perf_query_data[0]),
312 compare_perfcntr_pass);
313 }
314
315 VkResult result = tu_bo_init_new(device, &pool->bo,
316 pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
317 if (result != VK_SUCCESS) {
318 vk_object_free(&device->vk, pAllocator, pool);
319 return result;
320 }
321
322 result = tu_bo_map(device, pool->bo);
323 if (result != VK_SUCCESS) {
324 tu_bo_finish(device, pool->bo);
325 vk_object_free(&device->vk, pAllocator, pool);
326 return result;
327 }
328
329 /* Initialize all query statuses to unavailable */
330 memset(pool->bo->map, 0, pool->bo->size);
331
332 pool->type = pCreateInfo->queryType;
333 pool->stride = slot_size;
334 pool->size = pCreateInfo->queryCount;
335 pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
336 *pQueryPool = tu_query_pool_to_handle(pool);
337
338 return VK_SUCCESS;
339 }
340
341 VKAPI_ATTR void VKAPI_CALL
342 tu_DestroyQueryPool(VkDevice _device,
343 VkQueryPool _pool,
344 const VkAllocationCallbacks *pAllocator)
345 {
346 TU_FROM_HANDLE(tu_device, device, _device);
347 TU_FROM_HANDLE(tu_query_pool, pool, _pool);
348
349 if (!pool)
350 return;
351
352 tu_bo_finish(device, pool->bo);
353 vk_object_free(&device->vk, pAllocator, pool);
354 }
355
356 static uint32_t
357 get_result_count(struct tu_query_pool *pool)
358 {
359 switch (pool->type) {
360    /* Occlusion and timestamp queries write one integer value */
361 case VK_QUERY_TYPE_OCCLUSION:
362 case VK_QUERY_TYPE_TIMESTAMP:
363 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
364 return 1;
365 /* Transform feedback queries write two integer values */
366 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
367 return 2;
368 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
369 return util_bitcount(pool->pipeline_statistics);
370 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
371 return pool->counter_index_count;
372 default:
373 assert(!"Invalid query type");
374 return 0;
375 }
376 }
377
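/* Consumes the lowest set bit of *statistics and returns the index of the
 * RBBM_PRIMCTR counter (as captured by the begin/end snapshots in
 * emit_begin_stat_query/emit_end_stat_query) that backs that statistic.
 * Note that IA_VERTICES and VS_INVOCATIONS both map to counter 0. */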
378 static uint32_t
379 statistics_index(uint32_t *statistics)
380 {
381 uint32_t stat;
382 stat = u_bit_scan(statistics);
383
384 switch (1 << stat) {
385 case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
386 case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
387 return 0;
388 case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
389 return 1;
390 case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
391 return 2;
392 case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
393 return 4;
394 case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
395 return 5;
396 case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
397 return 6;
398 case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
399 return 7;
400 case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
401 return 8;
402 case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
403 return 9;
404 case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
405 return 10;
406 default:
407 return 0;
408 }
409 }
410
411 static bool
412 is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
413 {
414 return pipeline_statistics &
415 (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
416 VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
417 VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
418 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
419 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
420 VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
421 VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
422 VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
423 VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
424 }
425
426 static bool
427 is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
428 {
429 return pipeline_statistics &
430 VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
431 }
432
433 static bool
434 is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
435 {
436 return pipeline_statistics &
437 VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
438 }
439
440 /* Wait on the availability status of a query up until a timeout. */
441 static VkResult
442 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
443 uint32_t query)
444 {
445 /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
446 * scheduler friendly way instead of busy polling once the patch has landed
447 * upstream. */
448 struct query_slot *slot = slot_address(pool, query);
449 uint64_t abs_timeout = os_time_get_absolute_timeout(
450 WAIT_TIMEOUT * NSEC_PER_SEC);
451 while(os_time_get_nano() < abs_timeout) {
452 if (query_is_available(slot))
453 return VK_SUCCESS;
454 }
455 return vk_error(device, VK_TIMEOUT);
456 }
457
458 /* Writes a query value to a buffer from the CPU. */
459 static void
460 write_query_value_cpu(char* base,
461 uint32_t offset,
462 uint64_t value,
463 VkQueryResultFlags flags)
464 {
465 if (flags & VK_QUERY_RESULT_64_BIT) {
466 *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
467 } else {
468 *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
469 }
470 }
471
472 static VkResult
473 get_query_pool_results(struct tu_device *device,
474 struct tu_query_pool *pool,
475 uint32_t firstQuery,
476 uint32_t queryCount,
477 size_t dataSize,
478 void *pData,
479 VkDeviceSize stride,
480 VkQueryResultFlags flags)
481 {
482 assert(dataSize >= stride * queryCount);
483
484 char *result_base = (char *) pData;
485 VkResult result = VK_SUCCESS;
486 for (uint32_t i = 0; i < queryCount; i++) {
487 uint32_t query = firstQuery + i;
488 struct query_slot *slot = slot_address(pool, query);
489 bool available = query_is_available(slot);
490 uint32_t result_count = get_result_count(pool);
491 uint32_t statistics = pool->pipeline_statistics;
492
493 if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
494 VkResult wait_result = wait_for_available(device, pool, query);
495 if (wait_result != VK_SUCCESS)
496 return wait_result;
497 available = true;
498 } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
499 /* From the Vulkan 1.1.130 spec:
500 *
501 * If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
502 * both not set then no result values are written to pData for
503 * queries that are in the unavailable state at the time of the
504 * call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
505 * availability state is still written to pData for those queries
506 * if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
507 */
508 result = VK_NOT_READY;
509 if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
510 result_base += stride;
511 continue;
512 }
513 }
514
515 for (uint32_t k = 0; k < result_count; k++) {
516 if (available) {
517 uint64_t *result;
518
519 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
520 uint32_t stat_idx = statistics_index(&statistics);
521 result = query_result_addr(pool, query, uint64_t, stat_idx);
522 } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
523 result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
524 } else {
525 result = query_result_addr(pool, query, uint64_t, k);
526 }
527
528 write_query_value_cpu(result_base, k, *result, flags);
529 } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
530 /* From the Vulkan 1.1.130 spec:
531 *
532 * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
533 * is not set, and the query’s status is unavailable, an
534 * intermediate result value between zero and the final result
535 * value is written to pData for that query.
536 *
537 * Just return 0 here for simplicity since it's a valid result.
538 */
539 write_query_value_cpu(result_base, k, 0, flags);
540 }
541
542 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
543 /* From the Vulkan 1.1.130 spec:
544 *
545 * If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
546 * integer value written for each query is non-zero if the query’s
547 * status was available or zero if the status was unavailable.
548 */
549 write_query_value_cpu(result_base, result_count, available, flags);
550
551 result_base += stride;
552 }
553 return result;
554 }
555
556 VKAPI_ATTR VkResult VKAPI_CALL
557 tu_GetQueryPoolResults(VkDevice _device,
558 VkQueryPool queryPool,
559 uint32_t firstQuery,
560 uint32_t queryCount,
561 size_t dataSize,
562 void *pData,
563 VkDeviceSize stride,
564 VkQueryResultFlags flags)
565 {
566 TU_FROM_HANDLE(tu_device, device, _device);
567 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
568 assert(firstQuery + queryCount <= pool->size);
569
570 if (vk_device_is_lost(&device->vk))
571 return VK_ERROR_DEVICE_LOST;
572
573 switch (pool->type) {
574 case VK_QUERY_TYPE_OCCLUSION:
575 case VK_QUERY_TYPE_TIMESTAMP:
576 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
577 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
578 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
579 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
580 return get_query_pool_results(device, pool, firstQuery, queryCount,
581 dataSize, pData, stride, flags);
582 default:
583 assert(!"Invalid query type");
584 }
585 return VK_SUCCESS;
586 }
587
588 /* Copies a query value from one buffer to another from the GPU. */
589 static void
590 copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
591 struct tu_cs *cs,
592 uint64_t src_iova,
593 uint64_t base_write_iova,
594 uint32_t offset,
595 VkQueryResultFlags flags) {
596 uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
597 sizeof(uint64_t) : sizeof(uint32_t);
598 uint64_t write_iova = base_write_iova + (offset * element_size);
599
600 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
601 uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
602 CP_MEM_TO_MEM_0_DOUBLE : 0;
603 tu_cs_emit(cs, mem_to_mem_flags);
604 tu_cs_emit_qw(cs, write_iova);
605 tu_cs_emit_qw(cs, src_iova);
606 }
607
608 template <chip CHIP>
609 static void
610 emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
611 struct tu_cs *cs,
612 struct tu_query_pool *pool,
613 uint32_t firstQuery,
614 uint32_t queryCount,
615 struct tu_buffer *buffer,
616 VkDeviceSize dstOffset,
617 VkDeviceSize stride,
618 VkQueryResultFlags flags)
619 {
620 /* Flush cache for the buffer to copy to. */
621 tu_emit_cache_flush<CHIP>(cmdbuf);
622
623 /* From the Vulkan 1.1.130 spec:
624 *
625 * vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
626 * uses of vkCmdResetQueryPool in the same queue, without any additional
627 * synchronization.
628 *
629 * To ensure that previous writes to the available bit are coherent, first
630 * wait for all writes to complete.
631 */
632 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
633
634 for (uint32_t i = 0; i < queryCount; i++) {
635 uint32_t query = firstQuery + i;
636 uint64_t available_iova = query_available_iova(pool, query);
637 uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
638 uint32_t result_count = get_result_count(pool);
639 uint32_t statistics = pool->pipeline_statistics;
640
641 /* Wait for the available bit to be set if executed with the
642 * VK_QUERY_RESULT_WAIT_BIT flag. */
643 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
644 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
645 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
646 CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
647 tu_cs_emit_qw(cs, available_iova);
648 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
649 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
650 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
651 }
652
653 for (uint32_t k = 0; k < result_count; k++) {
654 uint64_t result_iova;
655
656 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
657 uint32_t stat_idx = statistics_index(&statistics);
658 result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
659 } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
660 result_iova = query_result_iova(pool, query,
661 struct perfcntr_query_slot, k);
662 } else {
663 result_iova = query_result_iova(pool, query, uint64_t, k);
664 }
665
666 if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
667 /* Unconditionally copying the bo->result into the buffer here is
668 * valid because we only set bo->result on vkCmdEndQuery. Thus, even
669 * if the query is unavailable, this will copy the correct partial
670 * value of 0.
671 */
672 copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
673 k /* offset */, flags);
674 } else {
675 /* Conditionally copy bo->result into the buffer based on whether the
676 * query is available.
677 *
678 * NOTE: For the conditional packets to be executed, CP_COND_EXEC
679 * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
680 * that 0 < available < 2, aka available == 1.
681 */
682 tu_cs_reserve(cs, 7 + 6);
683 tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
684 tu_cs_emit_qw(cs, available_iova);
685 tu_cs_emit_qw(cs, available_iova);
686 tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
687 tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
688
689 /* Start of conditional execution */
690 copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
691 k /* offset */, flags);
692 /* End of conditional execution */
693 }
694 }
695
696 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
697 copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
698 result_count /* offset */, flags);
699 }
700 }
701 }
702
703 template <chip CHIP>
704 VKAPI_ATTR void VKAPI_CALL
705 tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
706 VkQueryPool queryPool,
707 uint32_t firstQuery,
708 uint32_t queryCount,
709 VkBuffer dstBuffer,
710 VkDeviceSize dstOffset,
711 VkDeviceSize stride,
712 VkQueryResultFlags flags)
713 {
714 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
715 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
716 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
717 struct tu_cs *cs = &cmdbuf->cs;
718 assert(firstQuery + queryCount <= pool->size);
719
720 switch (pool->type) {
721 case VK_QUERY_TYPE_OCCLUSION:
722 case VK_QUERY_TYPE_TIMESTAMP:
723 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
724 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
725 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
726 return emit_copy_query_pool_results<CHIP>(cmdbuf, cs, pool, firstQuery,
727 queryCount, buffer, dstOffset,
728 stride, flags);
729 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
730 unreachable("allowCommandBufferQueryCopies is false");
731 default:
732 assert(!"Invalid query type");
733 }
734 }
735 TU_GENX(tu_CmdCopyQueryPoolResults);
736
737 static void
738 emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
739 struct tu_query_pool *pool,
740 uint32_t firstQuery,
741 uint32_t queryCount)
742 {
743 struct tu_cs *cs = &cmdbuf->cs;
744
745 for (uint32_t i = 0; i < queryCount; i++) {
746 uint32_t query = firstQuery + i;
747 uint32_t statistics = pool->pipeline_statistics;
748
749 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
750 tu_cs_emit_qw(cs, query_available_iova(pool, query));
751 tu_cs_emit_qw(cs, 0x0);
752
753 for (uint32_t k = 0; k < get_result_count(pool); k++) {
754 uint64_t result_iova;
755
756 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
757 uint32_t stat_idx = statistics_index(&statistics);
758 result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
759 } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
760 result_iova = query_result_iova(pool, query,
761 struct perfcntr_query_slot, k);
762 } else {
763 result_iova = query_result_iova(pool, query, uint64_t, k);
764 }
765
766 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
767 tu_cs_emit_qw(cs, result_iova);
768 tu_cs_emit_qw(cs, 0x0);
769 }
770 }
771
772 }
773
774 VKAPI_ATTR void VKAPI_CALL
775 tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
776 VkQueryPool queryPool,
777 uint32_t firstQuery,
778 uint32_t queryCount)
779 {
780 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
781 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
782
783 switch (pool->type) {
784 case VK_QUERY_TYPE_TIMESTAMP:
785 case VK_QUERY_TYPE_OCCLUSION:
786 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
787 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
788 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
789 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
790 emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
791 break;
792 default:
793 assert(!"Invalid query type");
794 }
795 }
796
797 VKAPI_ATTR void VKAPI_CALL
798 tu_ResetQueryPool(VkDevice device,
799 VkQueryPool queryPool,
800 uint32_t firstQuery,
801 uint32_t queryCount)
802 {
803 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
804
805 for (uint32_t i = 0; i < queryCount; i++) {
806 struct query_slot *slot = slot_address(pool, i + firstQuery);
807 slot->available = 0;
808
809 for (uint32_t k = 0; k < get_result_count(pool); k++) {
810 uint64_t *res;
811
812 if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
813 res = query_result_addr(pool, i + firstQuery,
814 struct perfcntr_query_slot, k);
815 } else {
816 res = query_result_addr(pool, i + firstQuery, uint64_t, k);
817 }
818
819 *res = 0;
820 }
821 }
822 }
823
824 template <chip CHIP>
825 static void
826 emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
827 struct tu_query_pool *pool,
828 uint32_t query)
829 {
830 /* From the Vulkan 1.1.130 spec:
831 *
832 * A query must begin and end inside the same subpass of a render pass
833 * instance, or must both begin and end outside of a render pass
834 * instance.
835 *
836 * Unlike on an immediate-mode renderer, Turnip renders all tiles on
837 * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
838 * query begins/ends inside the same subpass of a render pass, we need to
839 * record the packets on the secondary draw command stream. cmdbuf->draw_cs
840 * is then run on every tile during render, so we just need to accumulate
841 * sample counts in slot->result to compute the query result.
842 */
843 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
844
845 uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
846
847 tu_cs_emit_regs(cs,
848 A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
849
850 if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
851 tu_cs_emit_regs(cs,
852 A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
853 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
854 tu_cs_emit(cs, ZPASS_DONE);
855 if (CHIP == A7XX) {
856 /* Copied from blob's cmdstream, not sure why it is done. */
857 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
858 tu_cs_emit(cs, CCU_CLEAN_DEPTH);
859 }
860 } else {
861 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
862 tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
863 .write_sample_count = true).value);
864 tu_cs_emit_qw(cs, begin_iova);
865 }
866 }
867
868 template <chip CHIP>
869 static void
870 emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
871 struct tu_query_pool *pool,
872 uint32_t query)
873 {
874 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
875 uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin, 0);
876
877 if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
878 bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
879 cmdbuf->state.prim_counters_running++;
880
881       /* Prevent starting the primitive counters when they are supposed to be
882        * stopped for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
883 */
884 if (need_cond_exec) {
885 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
886 CP_COND_REG_EXEC_0_SYSMEM |
887 CP_COND_REG_EXEC_0_BINNING);
888 }
889
890 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
891
892 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
893 tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
894 tu_cs_emit(cs, 0);
895
896 if (need_cond_exec) {
897 tu_cond_exec_end(cs);
898 }
899 }
900
901 if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
902 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_FRAGMENT_CTRS);
903 }
904
905 if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
906 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_COMPUTE_CTRS);
907 }
908
909 tu_cs_emit_wfi(cs);
910
911 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
912 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
913 CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
914 CP_REG_TO_MEM_0_64B);
915 tu_cs_emit_qw(cs, begin_iova);
916 }
917
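/* Opens a predicated region that the CP only executes when the bit for
 * 'pass' is set in the PERF_CNTRS_REG scratch register; see the pass-index
 * description in emit_begin_perf_query below. */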
918 static void
919 emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
920 {
921 tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
922 tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
923 REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
924 A6XX_CP_REG_TEST_0_BIT(pass) |
925 A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME);
926 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
927 }
928
929 static void
930 emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
931 struct tu_query_pool *pool,
932 uint32_t query)
933 {
934 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
935 uint32_t last_pass = ~0;
936
937 if (cmdbuf->state.pass) {
938 cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
939 }
940
941 /* Querying perf counters happens in these steps:
942 *
943     *  0) There's a scratch reg used to select a pass index for the perf
944     *     counters query. Command streams that set each pass index in the
945     *     reg are prepared at device creation time. See tu_CreateDevice in tu_device.c
946     *  1) Emit command streams to read all requested perf counters at all
947     *     passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
948     *     reads the scratch reg where the pass index is set.
949     *     See emit_perfcntrs_pass_start.
950     *  2) Pick the cs that sets the proper pass index in the reg and prepend
951     *     it to the command buffer at each submit time.
952     *     See tu_QueueSubmit in tu_drm.c
953     *  3) If the bit for the pass index is set in the reg, the command
954     *     stream under CP_COND_REG_EXEC is executed.
955 */
956
957 tu_cs_emit_wfi(cs);
958
959 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
960 struct tu_perf_query_data *data = &pool->perf_query_data[i];
961
962 if (last_pass != data->pass) {
963 last_pass = data->pass;
964
965 if (data->pass != 0)
966 tu_cond_exec_end(cs);
967 emit_perfcntrs_pass_start(cs, data->pass);
968 }
969
970 const struct fd_perfcntr_counter *counter =
971 &pool->perf_group[data->gid].counters[data->cntr_reg];
972 const struct fd_perfcntr_countable *countable =
973 &pool->perf_group[data->gid].countables[data->cid];
974
975 tu_cs_emit_pkt4(cs, counter->select_reg, 1);
976 tu_cs_emit(cs, countable->selector);
977 }
978 tu_cond_exec_end(cs);
979
980 last_pass = ~0;
981 tu_cs_emit_wfi(cs);
982
983 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
984 struct tu_perf_query_data *data = &pool->perf_query_data[i];
985
986 if (last_pass != data->pass) {
987 last_pass = data->pass;
988
989 if (data->pass != 0)
990 tu_cond_exec_end(cs);
991 emit_perfcntrs_pass_start(cs, data->pass);
992 }
993
994 const struct fd_perfcntr_counter *counter =
995 &pool->perf_group[data->gid].counters[data->cntr_reg];
996
997 uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
998
999 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1000 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1001 CP_REG_TO_MEM_0_64B);
1002 tu_cs_emit_qw(cs, begin_iova);
1003 }
1004 tu_cond_exec_end(cs);
1005 }
1006
1007 template <chip CHIP>
1008 static void
1009 emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
1010 struct tu_query_pool *pool,
1011 uint32_t query,
1012 uint32_t stream_id)
1013 {
1014 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1015 uint64_t begin_iova = primitive_query_iova(pool, query, begin, 0, 0);
1016
1017 tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
1018 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1019 }
1020
1021 template <chip CHIP>
1022 static void
1023 emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1024 struct tu_query_pool *pool,
1025 uint32_t query)
1026 {
1027 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1028 uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1029
1030 if (cmdbuf->state.pass) {
1031 cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
1032 } else {
1033 cmdbuf->state.prim_generated_query_running_before_rp = true;
1034 }
1035
1036 cmdbuf->state.prim_counters_running++;
1037
1038 if (cmdbuf->state.pass) {
1039       /* Primitives that passed all tests are still counted in each
1040 * tile even with HW binning beforehand. Do not permit it.
1041 */
1042 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1043 CP_COND_REG_EXEC_0_SYSMEM |
1044 CP_COND_REG_EXEC_0_BINNING);
1045 }
1046
1047 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_START_PRIMITIVE_CTRS);
1048
1049 tu_cs_emit_wfi(cs);
1050
1051 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1052 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1053 CP_REG_TO_MEM_0_CNT(2) |
1054 CP_REG_TO_MEM_0_64B);
1055 tu_cs_emit_qw(cs, begin_iova);
1056
1057 if (cmdbuf->state.pass) {
1058 tu_cond_exec_end(cs);
1059 }
1060 }
1061
1062 template <chip CHIP>
1063 VKAPI_ATTR void VKAPI_CALL
1064 tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
1065 VkQueryPool queryPool,
1066 uint32_t query,
1067 VkQueryControlFlags flags)
1068 {
1069 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1070 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1071 assert(query < pool->size);
1072
1073 switch (pool->type) {
1074 case VK_QUERY_TYPE_OCCLUSION:
1075 /* In freedreno, there is no implementation difference between
1076 * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
1077 * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
1078 */
1079 emit_begin_occlusion_query<CHIP>(cmdbuf, pool, query);
1080 break;
1081 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1082 emit_begin_xfb_query<CHIP>(cmdbuf, pool, query, 0);
1083 break;
1084 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1085 emit_begin_prim_generated_query<CHIP>(cmdbuf, pool, query);
1086 break;
1087 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1088 emit_begin_perf_query(cmdbuf, pool, query);
1089 break;
1090 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1091 emit_begin_stat_query<CHIP>(cmdbuf, pool, query);
1092 break;
1093 case VK_QUERY_TYPE_TIMESTAMP:
1094 unreachable("Unimplemented query type");
1095 default:
1096 assert(!"Invalid query type");
1097 }
1098 }
1099 TU_GENX(tu_CmdBeginQuery);
1100
1101 template <chip CHIP>
1102 VKAPI_ATTR void VKAPI_CALL
1103 tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
1104 VkQueryPool queryPool,
1105 uint32_t query,
1106 VkQueryControlFlags flags,
1107 uint32_t index)
1108 {
1109 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
1110 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
1111 assert(query < pool->size);
1112
1113 switch (pool->type) {
1114 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1115 emit_begin_xfb_query<CHIP>(cmdbuf, pool, query, index);
1116 break;
1117 case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1118 emit_begin_prim_generated_query<CHIP>(cmdbuf, pool, query);
1119 break;
1120 default:
1121 assert(!"Invalid query type");
1122 }
1123 }
1124 TU_GENX(tu_CmdBeginQueryIndexedEXT);
1125
1126 template <chip CHIP>
1127 static void
1128 emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
1129 struct tu_query_pool *pool,
1130 uint32_t query)
1131 {
1132 /* Ending an occlusion query happens in a few steps:
1133 * 1) Set the slot->end to UINT64_MAX.
1134 * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
1135 * write the current sample count value into slot->end.
1136 * 3) Since (2) is asynchronous, wait until slot->end is not equal to
1137 * UINT64_MAX before continuing via CP_WAIT_REG_MEM.
1138 * 4) Accumulate the results of the query (slot->end - slot->begin) into
1139 * slot->result.
1140 * 5) If vkCmdEndQuery is *not* called from within the scope of a render
1141 * pass, set the slot's available bit since the query is now done.
1142 * 6) If vkCmdEndQuery *is* called from within the scope of a render
1143 * pass, we cannot mark as available yet since the commands in
1144 * draw_cs are not run until vkCmdEndRenderPass.
1145 */
1146 const struct tu_render_pass *pass = cmdbuf->state.pass;
1147 struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1148
1149 uint64_t available_iova = query_available_iova(pool, query);
1150 uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
1151 uint64_t end_iova = occlusion_query_iova(pool, query, end);
1152 uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
1153 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1154 tu_cs_emit_qw(cs, end_iova);
1155 tu_cs_emit_qw(cs, 0xffffffffffffffffull);
1156
1157 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1158
1159 tu_cs_emit_regs(cs,
1160 A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
1161
1162 if (!cmdbuf->device->physical_device->info->a7xx.has_event_write_sample_count) {
1163 tu_cs_emit_regs(cs,
1164 A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
1165 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1166 tu_cs_emit(cs, ZPASS_DONE);
1167 if (CHIP == A7XX) {
1168 /* Copied from blob's cmdstream, not sure why it is done. */
1169 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
1170 tu_cs_emit(cs, CCU_CLEAN_DEPTH);
1171 }
1172 } else {
1173 /* A7XX TODO: Calculate (end - begin) via ZPASS_DONE. */
1174 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 3);
1175 tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
1176 .write_sample_count = true).value);
1177 tu_cs_emit_qw(cs, end_iova);
1178 }
1179
1180 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
1181 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
1182 CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
1183 tu_cs_emit_qw(cs, end_iova);
1184 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
1185 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
1186 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1187
1188 /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
1189 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1190 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
1191 tu_cs_emit_qw(cs, result_iova);
1192 tu_cs_emit_qw(cs, result_iova);
1193 tu_cs_emit_qw(cs, end_iova);
1194 tu_cs_emit_qw(cs, begin_iova);
1195
1196 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1197
1198 if (pass)
1199 /* Technically, queries should be tracked per-subpass, but here we track
1200        * at the render pass level to simplify the code a bit. This is safe
1201 * because the only commands that use the available bit are
1202 * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
1203 * cannot be invoked from inside a render pass scope.
1204 */
1205 cs = &cmdbuf->draw_epilogue_cs;
1206
1207 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1208 tu_cs_emit_qw(cs, available_iova);
1209 tu_cs_emit_qw(cs, 0x1);
1210 }
1211
1212 /* PRIMITIVE_CTRS is used for two distinct queries:
1213 * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
1214 * - VK_QUERY_TYPE_PIPELINE_STATISTICS
1215  * If one is nested inside the other, STOP_PRIMITIVE_CTRS should be emitted
1216  * only for the outer query.
1217  *
1218  * Also, a pipeline stats query could run outside of a renderpass while a
1219  * prim gen query runs inside a secondary cmd buffer - in that case we have
1220  * to track the status of the pipeline stats query.
1221 */
1222 template <chip CHIP>
1223 static void
1224 emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
1225 struct tu_cs *cs,
1226 enum VkQueryType query_type)
1227 {
1228 bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
1229 cmdbuf->state.prim_counters_running--;
1230 if (cmdbuf->state.prim_counters_running == 0) {
1231 bool need_cond_exec =
1232 is_secondary &&
1233 query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
1234 is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
1235
1236 if (!need_cond_exec) {
1237 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1238 } else {
1239 tu_cs_reserve(cs, 7 + 2);
1240          /* Check that the pipeline stats query is not running; only then
1241           * can we stop the counter.
1242 */
1243 tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
1244 tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1245 tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1246 tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
1247 tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
1248
1249 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_PRIMITIVE_CTRS);
1250 }
1251 }
1252
1253 if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1254 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
1255 tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
1256 tu_cs_emit(cs, 1);
1257 }
1258 }
1259
1260 template <chip CHIP>
1261 static void
1262 emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
1263 struct tu_query_pool *pool,
1264 uint32_t query)
1265 {
1266 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1267 uint64_t end_iova = pipeline_stat_query_iova(pool, query, end, 0);
1268 uint64_t available_iova = query_available_iova(pool, query);
1269 uint64_t result_iova;
1270 uint64_t stat_start_iova;
1271 uint64_t stat_stop_iova;
1272
1273 if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
1274    /* No need to conditionally execute STOP_PRIMITIVE_CTRS when we are
1275     * inside a VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query within a
1276     * renderpass, because the counters are already stopped.
1277 */
1278 emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
1279 }
1280
1281 if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
1282 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_FRAGMENT_CTRS);
1283 }
1284
1285 if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
1286 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_STOP_COMPUTE_CTRS);
1287 }
1288
1289 tu_cs_emit_wfi(cs);
1290
1291 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1292 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
1293 CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
1294 CP_REG_TO_MEM_0_64B);
1295 tu_cs_emit_qw(cs, end_iova);
1296
1297 for (int i = 0; i < STAT_COUNT; i++) {
1298 result_iova = query_result_iova(pool, query, uint64_t, i);
1299 stat_start_iova = pipeline_stat_query_iova(pool, query, begin, i);
1300 stat_stop_iova = pipeline_stat_query_iova(pool, query, end, i);
1301
1302 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1303 tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1304 CP_MEM_TO_MEM_0_DOUBLE |
1305 CP_MEM_TO_MEM_0_NEG_C);
1306
1307 tu_cs_emit_qw(cs, result_iova);
1308 tu_cs_emit_qw(cs, result_iova);
1309 tu_cs_emit_qw(cs, stat_stop_iova);
1310 tu_cs_emit_qw(cs, stat_start_iova);
1311 }
1312
1313 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1314
1315 if (cmdbuf->state.pass)
1316 cs = &cmdbuf->draw_epilogue_cs;
1317
1318 /* Set the availability to 1 */
1319 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1320 tu_cs_emit_qw(cs, available_iova);
1321 tu_cs_emit_qw(cs, 0x1);
1322 }
1323
1324 static void
1325 emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
1326 struct tu_query_pool *pool,
1327 uint32_t query)
1328 {
1329 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1330 uint64_t available_iova = query_available_iova(pool, query);
1331 uint64_t end_iova;
1332 uint64_t begin_iova;
1333 uint64_t result_iova;
1334 uint32_t last_pass = ~0;
1335
1336 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1337 struct tu_perf_query_data *data = &pool->perf_query_data[i];
1338
1339 if (last_pass != data->pass) {
1340 last_pass = data->pass;
1341
1342 if (data->pass != 0)
1343 tu_cond_exec_end(cs);
1344 emit_perfcntrs_pass_start(cs, data->pass);
1345 }
1346
1347 const struct fd_perfcntr_counter *counter =
1348 &pool->perf_group[data->gid].counters[data->cntr_reg];
1349
1350 end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1351
1352 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1353 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
1354 CP_REG_TO_MEM_0_64B);
1355 tu_cs_emit_qw(cs, end_iova);
1356 }
1357 tu_cond_exec_end(cs);
1358
1359 last_pass = ~0;
1360 tu_cs_emit_wfi(cs);
1361
1362 for (uint32_t i = 0; i < pool->counter_index_count; i++) {
1363 struct tu_perf_query_data *data = &pool->perf_query_data[i];
1364
1365 if (last_pass != data->pass) {
1366 last_pass = data->pass;
1367
1368
1369 if (data->pass != 0)
1370 tu_cond_exec_end(cs);
1371 emit_perfcntrs_pass_start(cs, data->pass);
1372 }
1373
1374 result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
1375 data->app_idx);
1376 begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
1377 end_iova = perf_query_iova(pool, 0, end, data->app_idx);
1378
1379 /* result += end - begin */
1380 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1381 tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
1382 CP_MEM_TO_MEM_0_DOUBLE |
1383 CP_MEM_TO_MEM_0_NEG_C);
1384
1385 tu_cs_emit_qw(cs, result_iova);
1386 tu_cs_emit_qw(cs, result_iova);
1387 tu_cs_emit_qw(cs, end_iova);
1388 tu_cs_emit_qw(cs, begin_iova);
1389 }
1390 tu_cond_exec_end(cs);
1391
1392 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1393
1394 if (cmdbuf->state.pass)
1395 cs = &cmdbuf->draw_epilogue_cs;
1396
1397 /* Set the availability to 1 */
1398 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1399 tu_cs_emit_qw(cs, available_iova);
1400 tu_cs_emit_qw(cs, 0x1);
1401 }
1402
1403 template <chip CHIP>
1404 static void
1405 emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
1406 struct tu_query_pool *pool,
1407 uint32_t query,
1408 uint32_t stream_id)
1409 {
1410 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1411
1412 uint64_t end_iova = primitive_query_iova(pool, query, end, 0, 0);
1413 uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
1414 uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
1415 uint64_t begin_written_iova = primitive_query_iova(pool, query, begin, stream_id, 0);
1416 uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin, stream_id, 1);
1417 uint64_t end_written_iova = primitive_query_iova(pool, query, end, stream_id, 0);
1418 uint64_t end_generated_iova = primitive_query_iova(pool, query, end, stream_id, 1);
1419 uint64_t available_iova = query_available_iova(pool, query);
1420
1421 tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
1422 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_WRITE_PRIMITIVE_COUNTS);
1423
1424 tu_cs_emit_wfi(cs);
1425 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_FLUSH);
1426
1427 /* Set the count of written primitives */
1428 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1429 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1430 CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1431 tu_cs_emit_qw(cs, result_written_iova);
1432 tu_cs_emit_qw(cs, result_written_iova);
1433 tu_cs_emit_qw(cs, end_written_iova);
1434 tu_cs_emit_qw(cs, begin_written_iova);
1435
1436 tu_emit_event_write<CHIP>(cmdbuf, cs, FD_CACHE_FLUSH);
1437
1438 /* Set the count of generated primitives */
1439 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1440 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1441 CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
1442 tu_cs_emit_qw(cs, result_generated_iova);
1443 tu_cs_emit_qw(cs, result_generated_iova);
1444 tu_cs_emit_qw(cs, end_generated_iova);
1445 tu_cs_emit_qw(cs, begin_generated_iova);
1446
1447 /* Set the availability to 1 */
1448 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1449 tu_cs_emit_qw(cs, available_iova);
1450 tu_cs_emit_qw(cs, 0x1);
1451 }
1452
1453 template <chip CHIP>
1454 static void
1455 emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
1456 struct tu_query_pool *pool,
1457 uint32_t query)
1458 {
1459 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
1460
1461 if (!cmdbuf->state.pass) {
1462 cmdbuf->state.prim_generated_query_running_before_rp = false;
1463 }
1464
1465 uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
1466 uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
1467 uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
1468 uint64_t available_iova = query_available_iova(pool, query);
1469
1470 if (cmdbuf->state.pass) {
1471 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
1472 CP_COND_REG_EXEC_0_SYSMEM |
1473 CP_COND_REG_EXEC_0_BINNING);
1474 }
1475
1476 tu_cs_emit_wfi(cs);
1477
1478 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
1479 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
1480 CP_REG_TO_MEM_0_CNT(2) |
1481 CP_REG_TO_MEM_0_64B);
1482 tu_cs_emit_qw(cs, end_iova);
1483
1484 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
1485 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
1486 CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
1487 tu_cs_emit_qw(cs, result_iova);
1488 tu_cs_emit_qw(cs, result_iova);
1489 tu_cs_emit_qw(cs, end_iova);
1490 tu_cs_emit_qw(cs, begin_iova);
1491
1492 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
1493
1494    /* This should come after waiting for mem writes so we have up-to-date
1495     * info about which query is running.
1496 */
1497 emit_stop_primitive_ctrs<CHIP>(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
1498
1499 if (cmdbuf->state.pass) {
1500 tu_cond_exec_end(cs);
1501 }
1502
1503 if (cmdbuf->state.pass)
1504 cs = &cmdbuf->draw_epilogue_cs;
1505
1506 /* Set the availability to 1 */
1507 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
1508 tu_cs_emit_qw(cs, available_iova);
1509 tu_cs_emit_qw(cs, 0x1);
1510 }
1511
/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *    If queries are used while executing a render pass instance that has
 *    multiview enabled, the query uses N consecutive query indices in the
 *    query pool (starting at query) where N is the number of bits set in the
 *    view mask in the subpass the query is used in. How the numerical
 *    results of the query are distributed among the queries is
 *    implementation-dependent. For example, some implementations may write
 *    each view’s results to a distinct query, while other implementations
 *    may write the total result to the first query and write zero to the
 *    other queries. However, the sum of the results in all the queries must
 *    accurately reflect the total result of the query summed over all views.
 *    Applications can sum the results from all the queries to compute the
 *    total result.
 *
 * Since we execute all views at once, we write zero to the other queries.
 * Furthermore, because queries must be reset before use, and we set the
 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
 */
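/* For example, with a view mask of 0b0011 (two views) the full result is
 * written to `query` and `query + 1` only has its availability flag set;
 * its result stays 0 from the preceding reset, so the sum over both
 * queries still equals the total.
 */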

static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query<CHIP>(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query<CHIP>(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      emit_end_prim_generated_query<CHIP>(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_end_perf_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_end_stat_query<CHIP>(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);
}
TU_GENX(tu_CmdEndQuery);

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index < 4);
      emit_end_xfb_query<CHIP>(cmdbuf, pool, query, index);
      break;
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      emit_end_prim_generated_query<CHIP>(cmdbuf, pool, query);
      break;
   default:
      assert(!"Invalid query type");
   }
}
TU_GENX(tu_CmdEndQueryIndexedEXT);

VKAPI_ATTR void VKAPI_CALL
tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                      VkPipelineStageFlagBits2 pipelineStage,
                      VkQueryPool queryPool,
                      uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags2 top_of_pipe_flags =
      VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
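   /* The CP_REG_TO_MEM above copies the GPU's free-running always-on
    * counter into the query result slot; CNT(2) together with the 64B flag
    * transfers both 32-bit halves as a single 64-bit value.
    */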

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    - The first query is a timestamp value and (if more than one bit is
    *      set in the view mask) zero is written to the remaining queries.
    *      If two timestamps are written in the same subpass, the sum of the
    *      execution time of all views between those commands is the
    *      difference between the first query written by each command.
    *
    *    - All N queries are timestamp values. If two timestamps are written
    *      in the same subpass, the sum of the execution time of all views
    *      between those commands is the sum of the difference between
    *      corresponding queries written by each command. The difference
    *      between corresponding queries may be the execution time of a
    *      single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
    VkPhysicalDevice physicalDevice,
    uint32_t queueFamilyIndex,
    uint32_t* pCounterCount,
    VkPerformanceCounterKHR* pCounters,
    VkPerformanceCounterDescriptionKHR* pCounterDescriptions)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);

   uint32_t desc_count = *pCounterCount;
   uint32_t group_count;
   const struct fd_perfcntr_group *group =
      fd_perfcntrs(&phydev->dev_id, &group_count);
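   /* fd_perfcntrs() returns the table of performance counter groups that
    * freedreno knows about for this GPU; each group exposes a set of
    * countables and the physical counters they can be mapped onto.
    */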

   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
                          pCounterDescriptions, &desc_count);

   for (int i = 0; i < group_count; i++) {
      for (int j = 0; j < group[i].num_countables; j++) {

         vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
            counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
            counter->unit =
               fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
            counter->storage =
               fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];

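            /* Derive a stable UUID for the counter by hashing its name;
             * only the first sizeof(counter->uuid) bytes of the SHA-1
             * digest are kept.
             */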
            unsigned char sha1_result[20];
            _mesa_sha1_compute(group[i].countables[j].name,
                               strlen(group[i].countables[j].name),
                               sha1_result);
            memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
         }

         vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
            desc->flags = 0;

            snprintf(desc->name, sizeof(desc->name),
                     "%s", group[i].countables[j].name);
            snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
            snprintf(desc->description, sizeof(desc->description),
                     "%s: %s performance counter",
                     group[i].name, group[i].countables[j].name);
         }
      }
   }

   return vk_outarray_status(&out);
}

VKAPI_ATTR void VKAPI_CALL
tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
    VkPhysicalDevice physicalDevice,
    const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo,
    uint32_t* pNumPasses)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
   uint32_t group_count = 0;
   uint32_t gid = 0, cid = 0, n_passes;
   const struct fd_perfcntr_group *group =
      fd_perfcntrs(&phydev->dev_id, &group_count);

   uint32_t counters_requested[group_count];
   memset(counters_requested, 0x0, sizeof(counters_requested));
   *pNumPasses = 1;
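   /* Count how many countables the application requested from each group.
    * A group that is asked for more countables than it has physical
    * counters needs multiple passes, and the pool as a whole needs the
    * maximum over all groups.
    */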

   for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
      perfcntr_index(group, group_count,
                     pPerformanceQueryCreateInfo->pCounterIndices[i],
                     &gid, &cid);

      counters_requested[gid]++;
   }

   for (uint32_t i = 0; i < group_count; i++) {
      n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
      *pNumPasses = MAX2(*pNumPasses, n_passes);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_AcquireProfilingLockKHR(VkDevice device,
                           const VkAcquireProfilingLockInfoKHR* pInfo)
{
   /* TODO. Probably there's something to do for kgsl. */
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_ReleaseProfilingLockKHR(VkDevice device)
{
   /* TODO. Probably there's something to do for kgsl. */
   return;
}
