• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2024 The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "DeviceLostHelper.h"
16 
17 #include <algorithm>
18 #include <iterator>
19 #include <set>
20 
21 #include "host-common/logging.h"
22 
23 namespace gfxstream {
24 namespace vk {
25 
enableWithNvidiaDeviceDiagnosticCheckpoints()26 void DeviceLostHelper::enableWithNvidiaDeviceDiagnosticCheckpoints() { mEnabled = true; }
27 
createMarkerForCommandBuffer(const VkCommandBuffer & commandBuffer,MarkerType type)28 const void* DeviceLostHelper::createMarkerForCommandBuffer(const VkCommandBuffer& commandBuffer,
29                                                            MarkerType type) {
30     std::lock_guard<std::mutex> lock(mMarkersMutex);
31 
32     auto it = mMarkers.insert(CheckpointMarker{commandBuffer, type});
33 
34     // References and pointers to data stored in the container are only
35     // invalidated by erasing that element, even when the corresponding
36     // iterator is invalidated.
37     return reinterpret_cast<const void*>(&(*it.first));
38 }
39 
removeMarkersForCommandBuffer(const VkCommandBuffer & commandBuffer)40 void DeviceLostHelper::removeMarkersForCommandBuffer(const VkCommandBuffer& commandBuffer) {
41     std::lock_guard<std::mutex> lock(mMarkersMutex);
42     mMarkers.erase(CheckpointMarker{
43         .commandBuffer = commandBuffer,
44         .type = MarkerType::kBegin,
45     });
46     mMarkers.erase(CheckpointMarker{
47         .commandBuffer = commandBuffer,
48         .type = MarkerType::kEnd,
49     });
50 }
51 
addNeededDeviceExtensions(std::vector<const char * > * deviceExtensions)52 void DeviceLostHelper::addNeededDeviceExtensions(std::vector<const char*>* deviceExtensions) {
53     if (mEnabled) {
54         deviceExtensions->push_back(VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME);
55     }
56 }
57 
onDeviceCreated(const DeviceWithQueues & deviceInfo)58 void DeviceLostHelper::onDeviceCreated(const DeviceWithQueues& deviceInfo) {
59     if (!mEnabled) {
60         return;
61     }
62 
63     std::lock_guard<std::mutex> lock(mDevicesMutex);
64     mDevices[deviceInfo.device] = deviceInfo;
65 }
66 
onDeviceDestroyed(VkDevice device)67 void DeviceLostHelper::onDeviceDestroyed(VkDevice device) {
68     if (!mEnabled) {
69         return;
70     }
71 
72     std::lock_guard<std::mutex> lock(mDevicesMutex);
73     mDevices.erase(device);
74 }
75 
onBeginCommandBuffer(const VkCommandBuffer & commandBuffer,const VulkanDispatch * vk)76 void DeviceLostHelper::onBeginCommandBuffer(const VkCommandBuffer& commandBuffer,
77                                             const VulkanDispatch* vk) {
78     if (!mEnabled) {
79         return;
80     }
81 
82     const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kBegin);
83     vk->vkCmdSetCheckpointNV(commandBuffer, marker);
84 }
85 
onEndCommandBuffer(const VkCommandBuffer & commandBuffer,const VulkanDispatch * vk)86 void DeviceLostHelper::onEndCommandBuffer(const VkCommandBuffer& commandBuffer,
87                                           const VulkanDispatch* vk) {
88     if (!mEnabled) {
89         return;
90     }
91 
92     const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kEnd);
93     vk->vkCmdSetCheckpointNV(commandBuffer, marker);
94 }
95 
onResetCommandBuffer(const VkCommandBuffer & commandBuffer)96 void DeviceLostHelper::onResetCommandBuffer(const VkCommandBuffer& commandBuffer) {
97     if (!mEnabled) {
98         return;
99     }
100 
101     removeMarkersForCommandBuffer(commandBuffer);
102 }
103 
onFreeCommandBuffer(const VkCommandBuffer & commandBuffer)104 void DeviceLostHelper::onFreeCommandBuffer(const VkCommandBuffer& commandBuffer) {
105     if (!mEnabled) {
106         return;
107     }
108 
109     removeMarkersForCommandBuffer(commandBuffer);
110 }
111 
onDeviceLost()112 void DeviceLostHelper::onDeviceLost() {
113     if (!mEnabled) {
114         return;
115     }
116 
117     ERR("DeviceLostHelper starting lost device checks...");
118 
119     std::lock_guard<std::mutex> deviceLock(mDevicesMutex);
120 
121     for (const auto& [device, deviceWithQueues] : mDevices) {
122         const auto* deviceDispatch = deviceWithQueues.deviceDispatch;
123         if (deviceDispatch->vkDeviceWaitIdle(device) != VK_ERROR_DEVICE_LOST) {
124             continue;
125         }
126         ERR("VkDevice:%p was lost, checking for unfinished VkCommandBuffers...", device);
127 
128         struct CommandBufferOnQueue {
129             VkCommandBuffer commandBuffer = VK_NULL_HANDLE;
130             VkQueue queue = VK_NULL_HANDLE;
131         };
132         std::vector<CommandBufferOnQueue> unfinishedCommandBuffers;
133 
134         for (const QueueWithMutex& queueInfo : deviceWithQueues.queues) {
135             VkQueue queue = queueInfo.queue;
136 
137             std::vector<VkCheckpointDataNV> checkpointDatas;
138             {
139                 std::lock_guard<std::mutex> queueLock(*queueInfo.queueMutex);
140 
141                 uint32_t checkpointDataCount = 0;
142                 deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount, nullptr);
143                 if (checkpointDataCount == 0) continue;
144 
145                 checkpointDatas.resize(static_cast<size_t>(checkpointDataCount),
146                                        VkCheckpointDataNV{
147                                            .sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV,
148                                        });
149                 deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount,
150                                                            checkpointDatas.data());
151             }
152 
153             std::set<VkCommandBuffer> startedCommandBuffers;
154             std::set<VkCommandBuffer> finishedCommandBuffers;
155 
156             for (const VkCheckpointDataNV& checkpointData : checkpointDatas) {
157                 const auto& marker =
158                     *reinterpret_cast<const CheckpointMarker*>(checkpointData.pCheckpointMarker);
159                 if (marker.type == MarkerType::kBegin) {
160                     startedCommandBuffers.insert(marker.commandBuffer);
161                 } else {
162                     finishedCommandBuffers.erase(marker.commandBuffer);
163                 }
164             }
165 
166             std::set<VkCommandBuffer> unfinishedCommandBuffersForQueue;
167 
168             std::set_difference(startedCommandBuffers.begin(),                   //
169                                 startedCommandBuffers.end(),                     //
170                                 finishedCommandBuffers.begin(),                  //
171                                 finishedCommandBuffers.end(),                    //
172                                 std::inserter(unfinishedCommandBuffersForQueue,  //
173                                               unfinishedCommandBuffersForQueue.end()));
174 
175             for (const VkCommandBuffer commandBuffer : unfinishedCommandBuffersForQueue) {
176                 unfinishedCommandBuffers.push_back(CommandBufferOnQueue{
177                     .commandBuffer = commandBuffer,
178                     .queue = queue,
179                 });
180             }
181         }
182 
183         if (unfinishedCommandBuffers.empty()) {
184             ERR("VkDevice:%p has no outstanding VkCommandBuffers.", device);
185         } else {
186             ERR("VkDevice:%p has outstanding VkCommandBuffers:", device);
187             for (const CommandBufferOnQueue& unfinished : unfinishedCommandBuffers) {
188                 ERR("   - VkCommandBuffer:%p on VkQueue:%p", unfinished.commandBuffer,
189                     unfinished.queue);
190             }
191         }
192     }
193 
194     ERR("DeviceLostHelper finished lost device checks.");
195 }
196 
197 }  // namespace vk
198 }  // namespace gfxstream