// Copyright 2016-2024 The Khronos Group Inc.
//
// SPDX-License-Identifier: CC-BY-4.0

include::{generated}/meta/{refprefix}VK_KHR_performance_query.adoc[]

=== Other Extension Metadata

*Last Modified Date*::
    2019-10-08
*IP Status*::
    No known IP claims.
*Contributors*::
  - Jesse Barker, Unity Technologies
  - Kenneth Benzie, Codeplay
  - Jan-Harald Fredriksen, ARM
  - Jeff Leger, Qualcomm
  - Jesse Hall, Google
  - Tobias Hector, AMD
  - Neil Henning, Codeplay
  - Baldur Karlsson
  - Lionel Landwerlin, Intel
  - Peter Lohrmann, AMD
  - Alon Or-bach, Samsung
  - Daniel Rakos, AMD
  - Niklas Smedberg, Unity Technologies
  - Igor Ostrowski, Intel

=== Description

The `VK_KHR_performance_query` extension adds a mechanism to allow querying
of performance counters for use in applications and by profiling tools.

Each queue family may: expose counters that can: be enabled on a queue of
that family.
We extend elink:VkQueryType to add a new query type for performance queries,
and chain a structure on slink:VkQueryPoolCreateInfo to specify the
performance queries to enable.

include::{generated}/interfaces/VK_KHR_performance_query.adoc[]

=== Issues

1) Should this extension include a mechanism to begin a query in command
buffer _A_ and end the query in command buffer _B_?

*RESOLVED* No - queries are tied to command buffer creation and thus have to
be encapsulated within a single command buffer.

2) Should this extension include a mechanism to begin and end queries
globally on the queue, not using the existing command buffer commands?

*RESOLVED* No - for the same reason as in the resolution of issue 1.

3) Should this extension expose counters that require multiple passes?

*RESOLVED* Yes - users should re-submit a command buffer with the same
commands in it multiple times, specifying the pass to count as the query
parameter in slink:VkPerformanceQuerySubmitInfoKHR.

4) How to handle counters across parallel workloads?

*RESOLVED* In the spirit of Vulkan, a counter description flag
ename:VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR
denotes that the accuracy of a counter result is affected by parallel
workloads.

5) How to handle secondary command buffers?

*RESOLVED* Secondary command buffers inherit any counter pass index
specified in the parent primary command buffer.
Note: this is no longer an issue following the resolution of issue 10.

6) What commands does the profiling lock have to be held for?

*RESOLVED* For any command buffer that is being queried with a performance
query pool, the profiling lock must: be held while that command buffer is in
the _recording_, _executable_, or _pending_ state.

7) Should we support flink:vkCmdCopyQueryPoolResults?

*RESOLVED* Yes.

8) Should we allow performance queries to interact with multiview?

*RESOLVED* Yes, but the performance queries must be performed once for each
pass per view.

9) Should a `queryCount > 1` be usable for performance queries?

*RESOLVED* Yes.
Some vendors will have costly performance counter query pool creation, and
would prefer that, when a certain set of counters is to be used multiple
times, a `queryCount > 1` can be used to amortize the instantiation cost.

10) Should we introduce an indirect mechanism to set the counter pass index?

*RESOLVED* Specify the counter pass index at submit time instead, to avoid
requiring re-recording of command buffers when multiple counter passes are
needed.


=== Examples

The following example shows how to find what performance counters a queue
family supports, how to set up a query pool to record these performance
counters, how to add the query pool to the command buffer to record
information, and how to get the results from the query pool.
109 110[source,c++] 111---- 112// A previously created physical device 113VkPhysicalDevice physicalDevice; 114 115// One of the queue families our device supports 116uint32_t queueFamilyIndex; 117 118uint32_t counterCount; 119 120// Get the count of counters supported 121vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( 122 physicalDevice, 123 queueFamilyIndex, 124 &counterCount, 125 NULL, 126 NULL); 127 128VkPerformanceCounterKHR* counters = 129 malloc(sizeof(VkPerformanceCounterKHR) * counterCount); 130VkPerformanceCounterDescriptionKHR* counterDescriptions = 131 malloc(sizeof(VkPerformanceCounterDescriptionKHR) * counterCount); 132 133// Get the counters supported 134vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( 135 physicalDevice, 136 queueFamilyIndex, 137 &counterCount, 138 counters, 139 counterDescriptions); 140 141// Try to enable the first 8 counters 142uint32_t enabledCounters[8]; 143 144const uint32_t enabledCounterCount = min(counterCount, 8)); 145 146for (uint32_t i = 0; i < enabledCounterCount; i++) { 147 enabledCounters[i] = i; 148} 149 150// A previously created device that had the performanceCounterQueryPools feature 151// set to VK_TRUE 152VkDevice device; 153 154VkQueryPoolPerformanceCreateInfoKHR performanceQueryCreateInfo = { 155 .sType = VK_STRUCTURE_TYPE_QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR, 156 .pNext = NULL, 157 158 // Specify the queue family that this performance query is performed on 159 .queueFamilyIndex = queueFamilyIndex, 160 161 // The number of counters to enable 162 .counterIndexCount = enabledCounterCount, 163 164 // The array of indices of counters to enable 165 .pCounterIndices = enabledCounters 166}; 167 168 169// Get the number of passes our counters will require. 
170uint32_t numPasses; 171 172vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( 173 physicalDevice, 174 &performanceQueryCreateInfo, 175 &numPasses); 176 177VkQueryPoolCreateInfo queryPoolCreateInfo = { 178 .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, 179 .pNext = &performanceQueryCreateInfo, 180 .flags = 0, 181 // Using our new query type here 182 .queryType = VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR, 183 .queryCount = 1, 184 .pipelineStatistics = 0 185}; 186 187VkQueryPool queryPool; 188 189VkResult result = vkCreateQueryPool( 190 device, 191 &queryPoolCreateInfo, 192 NULL, 193 &queryPool); 194 195assert(VK_SUCCESS == result); 196 197// A queue from queueFamilyIndex 198VkQueue queue; 199 200// A command buffer we want to record counters on 201VkCommandBuffer commandBuffer; 202 203VkCommandBufferBeginInfo commandBufferBeginInfo = { 204 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, 205 .pNext = NULL, 206 .flags = 0, 207 .pInheritanceInfo = NULL 208}; 209 210VkAcquireProfilingLockInfoKHR lockInfo = { 211 .sType = VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR, 212 .pNext = NULL, 213 .flags = 0, 214 .timeout = UINT64_MAX // Wait forever for the lock 215}; 216 217// Acquire the profiling lock before we record command buffers 218// that will use performance queries 219 220result = vkAcquireProfilingLockKHR(device, &lockInfo); 221 222assert(VK_SUCCESS == result); 223 224result = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); 225 226assert(VK_SUCCESS == result); 227 228vkCmdResetQueryPool( 229 commandBuffer, 230 queryPool, 231 0, 232 1); 233 234vkCmdBeginQuery( 235 commandBuffer, 236 queryPool, 237 0, 238 0); 239 240// Perform the commands you want to get performance information on 241// ... 
242 243// Perform a barrier to ensure all previous commands were complete before 244// ending the query 245vkCmdPipelineBarrier(commandBuffer, 246 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 247 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 248 0, 249 0, 250 NULL, 251 0, 252 NULL, 253 0, 254 NULL); 255 256vkCmdEndQuery( 257 commandBuffer, 258 queryPool, 259 0); 260 261result = vkEndCommandBuffer(commandBuffer); 262 263assert(VK_SUCCESS == result); 264 265for (uint32_t counterPass = 0; counterPass < numPasses; counterPass++) { 266 267 VkPerformanceQuerySubmitInfoKHR performanceQuerySubmitInfo = { 268 VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR, 269 NULL, 270 counterPass 271 }; 272 273 274 // Submit the command buffer and wait for its completion 275 // ... 276} 277 278// Release the profiling lock after the command buffer is no longer in the 279// pending state. 280vkReleaseProfilingLockKHR(device); 281 282result = vkResetCommandBuffer(commandBuffer, 0); 283 284assert(VK_SUCCESS == result); 285 286// Create an array to hold the results of all counters 287VkPerformanceCounterResultKHR* recordedCounters = malloc( 288 sizeof(VkPerformanceCounterResultKHR) * enabledCounterCount); 289 290result = vkGetQueryPoolResults( 291 device, 292 queryPool, 293 0, 294 1, 295 sizeof(VkPerformanceCounterResultKHR) * enabledCounterCount, 296 recordedCounters, 297 sizeof(VkPerformanceCounterResultKHR) * enabledCounterCount, 298 NULL); 299 300// recordedCounters is filled with our counters, we will look at one for posterity 301switch (counters[0].storage) { 302 case VK_PERFORMANCE_COUNTER_STORAGE_INT32: 303 // use recordCounters[0].int32 to get at the counter result! 304 break; 305 case VK_PERFORMANCE_COUNTER_STORAGE_INT64: 306 // use recordCounters[0].int64 to get at the counter result! 307 break; 308 case VK_PERFORMANCE_COUNTER_STORAGE_UINT32: 309 // use recordCounters[0].uint32 to get at the counter result! 
310 break; 311 case VK_PERFORMANCE_COUNTER_STORAGE_UINT64: 312 // use recordCounters[0].uint64 to get at the counter result! 313 break; 314 case VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32: 315 // use recordCounters[0].float32 to get at the counter result! 316 break; 317 case VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64: 318 // use recordCounters[0].float64 to get at the counter result! 319 break; 320} 321---- 322 323=== Version History 324 325 * Revision 1, 2019-10-08 326