1 // Copyright 2024 The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "DeviceLostHelper.h"
16
17 #include <algorithm>
18 #include <iterator>
19 #include <set>
20
21 #include "host-common/logging.h"
22
23 namespace gfxstream {
24 namespace vk {
25
enableWithNvidiaDeviceDiagnosticCheckpoints()26 void DeviceLostHelper::enableWithNvidiaDeviceDiagnosticCheckpoints() { mEnabled = true; }
27
createMarkerForCommandBuffer(const VkCommandBuffer & commandBuffer,MarkerType type)28 const void* DeviceLostHelper::createMarkerForCommandBuffer(const VkCommandBuffer& commandBuffer,
29 MarkerType type) {
30 std::lock_guard<std::mutex> lock(mMarkersMutex);
31
32 auto it = mMarkers.insert(CheckpointMarker{commandBuffer, type});
33
34 // References and pointers to data stored in the container are only
35 // invalidated by erasing that element, even when the corresponding
36 // iterator is invalidated.
37 return reinterpret_cast<const void*>(&(*it.first));
38 }
39
removeMarkersForCommandBuffer(const VkCommandBuffer & commandBuffer)40 void DeviceLostHelper::removeMarkersForCommandBuffer(const VkCommandBuffer& commandBuffer) {
41 std::lock_guard<std::mutex> lock(mMarkersMutex);
42 mMarkers.erase(CheckpointMarker{
43 .commandBuffer = commandBuffer,
44 .type = MarkerType::kBegin,
45 });
46 mMarkers.erase(CheckpointMarker{
47 .commandBuffer = commandBuffer,
48 .type = MarkerType::kEnd,
49 });
50 }
51
addNeededDeviceExtensions(std::vector<const char * > * deviceExtensions)52 void DeviceLostHelper::addNeededDeviceExtensions(std::vector<const char*>* deviceExtensions) {
53 if (mEnabled) {
54 deviceExtensions->push_back(VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME);
55 }
56 }
57
onDeviceCreated(const DeviceWithQueues & deviceInfo)58 void DeviceLostHelper::onDeviceCreated(const DeviceWithQueues& deviceInfo) {
59 if (!mEnabled) {
60 return;
61 }
62
63 std::lock_guard<std::mutex> lock(mDevicesMutex);
64 mDevices[deviceInfo.device] = deviceInfo;
65 }
66
onDeviceDestroyed(VkDevice device)67 void DeviceLostHelper::onDeviceDestroyed(VkDevice device) {
68 if (!mEnabled) {
69 return;
70 }
71
72 std::lock_guard<std::mutex> lock(mDevicesMutex);
73 mDevices.erase(device);
74 }
75
onBeginCommandBuffer(const VkCommandBuffer & commandBuffer,const VulkanDispatch * vk)76 void DeviceLostHelper::onBeginCommandBuffer(const VkCommandBuffer& commandBuffer,
77 const VulkanDispatch* vk) {
78 if (!mEnabled) {
79 return;
80 }
81
82 const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kBegin);
83 vk->vkCmdSetCheckpointNV(commandBuffer, marker);
84 }
85
onEndCommandBuffer(const VkCommandBuffer & commandBuffer,const VulkanDispatch * vk)86 void DeviceLostHelper::onEndCommandBuffer(const VkCommandBuffer& commandBuffer,
87 const VulkanDispatch* vk) {
88 if (!mEnabled) {
89 return;
90 }
91
92 const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kEnd);
93 vk->vkCmdSetCheckpointNV(commandBuffer, marker);
94 }
95
onResetCommandBuffer(const VkCommandBuffer & commandBuffer)96 void DeviceLostHelper::onResetCommandBuffer(const VkCommandBuffer& commandBuffer) {
97 if (!mEnabled) {
98 return;
99 }
100
101 removeMarkersForCommandBuffer(commandBuffer);
102 }
103
onFreeCommandBuffer(const VkCommandBuffer & commandBuffer)104 void DeviceLostHelper::onFreeCommandBuffer(const VkCommandBuffer& commandBuffer) {
105 if (!mEnabled) {
106 return;
107 }
108
109 removeMarkersForCommandBuffer(commandBuffer);
110 }
111
onDeviceLost()112 void DeviceLostHelper::onDeviceLost() {
113 if (!mEnabled) {
114 return;
115 }
116
117 ERR("DeviceLostHelper starting lost device checks...");
118
119 std::lock_guard<std::mutex> deviceLock(mDevicesMutex);
120
121 for (const auto& [device, deviceWithQueues] : mDevices) {
122 const auto* deviceDispatch = deviceWithQueues.deviceDispatch;
123 if (deviceDispatch->vkDeviceWaitIdle(device) != VK_ERROR_DEVICE_LOST) {
124 continue;
125 }
126 ERR("VkDevice:%p was lost, checking for unfinished VkCommandBuffers...", device);
127
128 struct CommandBufferOnQueue {
129 VkCommandBuffer commandBuffer = VK_NULL_HANDLE;
130 VkQueue queue = VK_NULL_HANDLE;
131 };
132 std::vector<CommandBufferOnQueue> unfinishedCommandBuffers;
133
134 for (const QueueWithMutex& queueInfo : deviceWithQueues.queues) {
135 VkQueue queue = queueInfo.queue;
136
137 std::vector<VkCheckpointDataNV> checkpointDatas;
138 {
139 std::lock_guard<std::mutex> queueLock(*queueInfo.queueMutex);
140
141 uint32_t checkpointDataCount = 0;
142 deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount, nullptr);
143 if (checkpointDataCount == 0) continue;
144
145 checkpointDatas.resize(static_cast<size_t>(checkpointDataCount),
146 VkCheckpointDataNV{
147 .sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV,
148 });
149 deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount,
150 checkpointDatas.data());
151 }
152
153 std::set<VkCommandBuffer> startedCommandBuffers;
154 std::set<VkCommandBuffer> finishedCommandBuffers;
155
156 for (const VkCheckpointDataNV& checkpointData : checkpointDatas) {
157 const auto& marker =
158 *reinterpret_cast<const CheckpointMarker*>(checkpointData.pCheckpointMarker);
159 if (marker.type == MarkerType::kBegin) {
160 startedCommandBuffers.insert(marker.commandBuffer);
161 } else {
162 finishedCommandBuffers.erase(marker.commandBuffer);
163 }
164 }
165
166 std::set<VkCommandBuffer> unfinishedCommandBuffersForQueue;
167
168 std::set_difference(startedCommandBuffers.begin(), //
169 startedCommandBuffers.end(), //
170 finishedCommandBuffers.begin(), //
171 finishedCommandBuffers.end(), //
172 std::inserter(unfinishedCommandBuffersForQueue, //
173 unfinishedCommandBuffersForQueue.end()));
174
175 for (const VkCommandBuffer commandBuffer : unfinishedCommandBuffersForQueue) {
176 unfinishedCommandBuffers.push_back(CommandBufferOnQueue{
177 .commandBuffer = commandBuffer,
178 .queue = queue,
179 });
180 }
181 }
182
183 if (unfinishedCommandBuffers.empty()) {
184 ERR("VkDevice:%p has no outstanding VkCommandBuffers.", device);
185 } else {
186 ERR("VkDevice:%p has outstanding VkCommandBuffers:", device);
187 for (const CommandBufferOnQueue& unfinished : unfinishedCommandBuffers) {
188 ERR(" - VkCommandBuffer:%p on VkQueue:%p", unfinished.commandBuffer,
189 unfinished.queue);
190 }
191 }
192 }
193
194 ERR("DeviceLostHelper finished lost device checks.");
195 }
196
197 } // namespace vk
198 } // namespace gfxstream