// Copyright 2016-2021 The Khronos Group, Inc. // // SPDX-License-Identifier: CC-BY-4.0 include::{generated}/meta/{refprefix}VK_KHR_performance_query.txt[] === Other Extension Metadata *Last Modified Date*:: 2019-10-08 *IP Status*:: No known IP claims. *Contributors*:: - Jesse Barker, Unity Technologies - Kenneth Benzie, Codeplay - Jan-Harald Fredriksen, ARM - Jeff Leger, Qualcomm - Jesse Hall, Google - Tobias Hector, AMD - Neil Henning, Codeplay - Baldur Karlsson - Lionel Landwerlin, Intel - Peter Lohrmann, AMD - Alon Or-bach, Samsung - Daniel Rakos, AMD - Niklas Smedberg, Unity Technologies - Igor Ostrowski, Intel === Description The `VK_KHR_performance_query` extension adds a mechanism to allow querying of performance counters for use in applications and by profiling tools. Each queue family may: expose counters that can: be enabled on a queue of that family. We extend elink:VkQueryType to add a new query type for performance queries, and chain a structure on slink:VkQueryPoolCreateInfo to specify the performance queries to enable. include::{generated}/interfaces/VK_KHR_performance_query.txt[] === Issues 1) Should this extension include a mechanism to begin a query in command buffer _A_ and end the query in command buffer _B_? *RESOLVED* No - queries are tied to command buffer creation and thus have to be encapsulated within a single command buffer. 2) Should this extension include a mechanism to begin and end queries globally on the queue, not using the existing command buffer commands? *RESOLVED* No - for the same reasoning as the resolution of 1). 3) Should this extension expose counters that require multiple passes? *RESOLVED* Yes - users should re-submit a command buffer with the same commands in it multiple times, specifying the pass to count as the query parameter in VkPerformanceQuerySubmitInfoKHR. 4) How to handle counters across parallel workloads? 
*RESOLVED* In the spirit of Vulkan, a counter description flag ename:VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR denotes that the accuracy of a counter result is affected by parallel workloads. 5) How to handle secondary command buffers? *RESOLVED* Secondary command buffers inherit any counter pass index specified in the parent primary command buffer. Note: this is no longer an issue following the resolution of issue 10. 6) What commands does the profiling lock have to be held for? *RESOLVED* For any command buffer that is being queried with a performance query pool, the profiling lock must: be held while that command buffer is in the _recording_, _executable_, or _pending_ state. 7) Should we support flink:vkCmdCopyQueryPoolResults? *RESOLVED* Yes. 8) Should we allow performance queries to interact with multiview? *RESOLVED* Yes, but the performance queries must be performed once for each pass per view. 9) Should a `queryCount > 1` be usable for performance queries? *RESOLVED* Yes. Some vendors will have costly performance counter query pool creation, and if a certain set of counters is to be used multiple times, they would rather use a `queryCount > 1` to amortize the instantiation cost. 10) Should we introduce an indirect mechanism to set the counter pass index? *RESOLVED* Specify the counter pass index at submit time instead, to avoid requiring re-recording of command buffers when multiple counter passes are needed. === Examples The following example shows how to find what performance counters a queue family supports, how to set up a query pool to record these performance counters, how to add the query pool to the command buffer to record information, and how to get the results from the query pool. 
[source,c++] -------------------------------------- // A previously created physical device VkPhysicalDevice physicalDevice; // One of the queue families our device supports uint32_t queueFamilyIndex; uint32_t counterCount; // Get the count of counters supported vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( physicalDevice, queueFamilyIndex, &counterCount, NULL, NULL); VkPerformanceCounterKHR* counters = malloc(sizeof(VkPerformanceCounterKHR) * counterCount); VkPerformanceCounterDescriptionKHR* counterDescriptions = malloc(sizeof(VkPerformanceCounterDescriptionKHR) * counterCount); // Get the counters supported vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( physicalDevice, queueFamilyIndex, &counterCount, counters, counterDescriptions); // Try to enable the first 8 counters uint32_t enabledCounters[8]; const uint32_t enabledCounterCount = min(counterCount, 8)); for (uint32_t i = 0; i < enabledCounterCount; i++) { enabledCounters[i] = i; } // A previously created device that had the performanceCounterQueryPools feature // set to VK_TRUE VkDevice device; VkQueryPoolPerformanceCreateInfoKHR performanceQueryCreateInfo = { VK_STRUCTURE_TYPE_QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR, NULL, // Specify the queue family that this performance query is performed on queueFamilyIndex, // The number of counters to enable enabledCounterCount, // The array of indices of counters to enable enabledCounters }; // Get the number of passes our counters will require. 
uint32_t numPasses; vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( physicalDevice, &performanceQueryCreateInfo, &numPasses); VkQueryPoolCreateInfo queryPoolCreateInfo = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, &performanceQueryCreateInfo, 0, // Using our new query type here VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR, 1, 0 }; VkQueryPool queryPool; VkResult result = vkCreateQueryPool( device, &queryPoolCreateInfo, NULL, &queryPool); assert(VK_SUCCESS == result); // A queue from queueFamilyIndex VkQueue queue; // A command buffer we want to record counters on VkCommandBuffer commandBuffer; VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, NULL, 0, NULL }; VkAcquireProfilingLockInfoKHR lockInfo = { VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR, NULL, 0, UINT64_MAX // Wait forever for the lock }; // Acquire the profiling lock before we record command buffers // that will use performance queries result = vkAcquireProfilingLockKHR(device, &lockInfo); assert(VK_SUCCESS == result); result = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); assert(VK_SUCCESS == result); vkCmdResetQueryPool( commandBuffer, queryPool, 0, 1); vkCmdBeginQuery( commandBuffer, queryPool, 0, 0); // Perform the commands you want to get performance information on // ... // Perform a barrier to ensure all previous commands were complete before // ending the query vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, NULL, 0, NULL, 0, NULL); vkCmdEndQuery( commandBuffer, queryPool, 0); result = vkEndCommandBuffer(commandBuffer); assert(VK_SUCCESS == result); for (uint32_t counterPass = 0; counterPass < numPasses; counterPass++) { VkPerformanceQuerySubmitInfoKHR performanceQuerySubmitInfo = { VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR, NULL, counterPass }; // Submit the command buffer and wait for its completion // ... 
} // Release the profiling lock after the command buffer is no longer in the // pending state. vkReleaseProfilingLockKHR(device); result = vkResetCommandBuffer(commandBuffer, 0); assert(VK_SUCCESS == result); // Create an array to hold the results of all counters VkPerformanceCounterResultKHR* recordedCounters = malloc( sizeof(VkPerformanceCounterResultKHR) * enabledCounterCount); result = vkGetQueryPoolResults( device, queryPool, 0, 1, sizeof(VkPerformanceCounterResultKHR) * enabledCounterCount, recordedCounters, sizeof(VkPerformanceCounterResultKHR), NULL); // recordedCounters is filled with our counters, we will look at one for posterity switch (counters[0].storage) { case VK_PERFORMANCE_COUNTER_STORAGE_INT32: // use recordCounters[0].int32 to get at the counter result! break; case VK_PERFORMANCE_COUNTER_STORAGE_INT64: // use recordCounters[0].int64 to get at the counter result! break; case VK_PERFORMANCE_COUNTER_STORAGE_UINT32: // use recordCounters[0].uint32 to get at the counter result! break; case VK_PERFORMANCE_COUNTER_STORAGE_UINT64: // use recordCounters[0].uint64 to get at the counter result! break; case VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32: // use recordCounters[0].float32 to get at the counter result! break; case VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64: // use recordCounters[0].float64 to get at the counter result! break; } -------------------------------------- === Version History * Revision 1, 2019-10-08