1 /*
2 * Copyright (C) 2022 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "aemu/base/AndroidHealthMonitor.h"
17
18 #include <map>
19 #include <sys/time.h>
20
21 namespace android {
22 namespace base {
23 namespace guest {
24
25 using android::base::guest::AutoLock;
26 using std::chrono::duration_cast;
27
// Overload set assembled from several callables: it inherits from every
// handler type and re-exports each handler's operator(), so a single object can
// be handed to std::visit with one handler per variant alternative.
template <class... Handlers>
struct MonitoredEventVisitor : Handlers... {
    using Handlers::operator()...;
};

// Deduction guide: lets `MonitoredEventVisitor{f, g, ...}` deduce the handler
// types from the constructor arguments.
template <class... Handlers>
MonitoredEventVisitor(Handlers...) -> MonitoredEventVisitor<Handlers...>;
34
35 template <class Clock>
HealthMonitor(HealthMonitorConsumer & consumer,uint64_t heartbeatInterval)36 HealthMonitor<Clock>::HealthMonitor(HealthMonitorConsumer& consumer, uint64_t heartbeatInterval)
37 : mInterval(Duration(std::chrono::milliseconds(heartbeatInterval))), mConsumer(consumer) {
38 start();
39 }
40
41 template <class Clock>
~HealthMonitor()42 HealthMonitor<Clock>::~HealthMonitor() {
43 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::EndMonitoring{});
44 {
45 AutoLock lock(mLock);
46 mEventQueue.push(std::move(event));
47 }
48 poll();
49 wait();
50 }
51
52 template <class Clock>
startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,std::optional<std::function<std::unique_ptr<HangAnnotations> ()>> onHangAnnotationsCallback,uint64_t timeout,std::optional<Id> parentId)53 typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
54 std::unique_ptr<EventHangMetadata> metadata,
55 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback,
56 uint64_t timeout, std::optional<Id> parentId) {
57 auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count();
58 if (timeout < intervalMs) {
59 ALOGW("Timeout value %llu is too low (heartbeat is every %llu). Increasing to %llu",
60 (unsigned long long)timeout, (unsigned long long) intervalMs,
61 (unsigned long long)intervalMs * 2);
62 timeout = intervalMs * 2;
63 }
64
65 AutoLock lock(mLock);
66 auto id = mNextId++;
67 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Start{
68 .id = id,
69 .metadata = std::move(metadata),
70 .timeOccurred = Clock::now(),
71 .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback),
72 .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)),
73 .parentId = parentId});
74 mEventQueue.push(std::move(event));
75 return id;
76 }
77
78 template <class Clock>
touchMonitoredTask(Id id)79 void HealthMonitor<Clock>::touchMonitoredTask(Id id) {
80 auto event = std::make_unique<MonitoredEvent>(
81 typename MonitoredEventType::Touch{.id = id, .timeOccurred = Clock::now()});
82 AutoLock lock(mLock);
83 mEventQueue.push(std::move(event));
84 }
85
86 template <class Clock>
stopMonitoringTask(Id id)87 void HealthMonitor<Clock>::stopMonitoringTask(Id id) {
88 auto event = std::make_unique<MonitoredEvent>(
89 typename MonitoredEventType::Stop{.id = id, .timeOccurred = Clock::now()});
90 AutoLock lock(mLock);
91 mEventQueue.push(std::move(event));
92 }
93
94 template <class Clock>
poll()95 std::future<void> HealthMonitor<Clock>::poll() {
96 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Poll{});
97 std::future<void> ret =
98 std::get<typename MonitoredEventType::Poll>(*event).complete.get_future();
99
100 AutoLock lock(mLock);
101 mEventQueue.push(std::move(event));
102 mCv.signalAndUnlock(&lock);
103 return ret;
104 }
105
106 // Thread's main loop
107 template <class Clock>
main()108 intptr_t HealthMonitor<Clock>::main() {
109 bool keepMonitoring = true;
110 std::queue<std::unique_ptr<MonitoredEvent>> events;
111
112 while (keepMonitoring) {
113 std::vector<std::promise<void>> pollPromises;
114 std::unordered_set<Id> tasksToRemove;
115 int newHungTasks = mHungTasks;
116 {
117 AutoLock lock(mLock);
118 struct timeval currentTime;
119 gettimeofday(¤tTime, 0);
120 if (mEventQueue.empty()) {
121 mCv.timedWait(
122 &mLock,
123 currentTime.tv_sec * 1000000LL + currentTime.tv_usec +
124 std::chrono::duration_cast<std::chrono::microseconds>(mInterval).count());
125 }
126 mEventQueue.swap(events);
127 }
128
129 Timestamp now = Clock::now();
130 while (!events.empty()) {
131 auto event(std::move(events.front()));
132 events.pop();
133
134 std::visit(MonitoredEventVisitor{
135 [](std::monostate& event) {
136 ALOGE("MonitoredEvent type not found");
137 abort();
138 },
139 [this, &events](typename MonitoredEventType::Start& event) {
140 auto it = mMonitoredTasks.find(event.id);
141 if (it != mMonitoredTasks.end()) {
142 ALOGE("Registered multiple start events for task %llu",
143 (unsigned long long)event.id);
144 return;
145 }
146 if (event.parentId && mMonitoredTasks.find(event.parentId.value()) ==
147 mMonitoredTasks.end()) {
148 ALOGW("Requested parent task %llu does not exist.",
149 (unsigned long long)event.parentId.value());
150 event.parentId = std::nullopt;
151 }
152 it = mMonitoredTasks
153 .emplace(event.id,
154 std::move(MonitoredTask{
155 .id = event.id,
156 .timeoutTimestamp = event.timeOccurred +
157 event.timeoutThreshold,
158 .timeoutThreshold = event.timeoutThreshold,
159 .hungTimestamp = std::nullopt,
160 .metadata = std::move(event.metadata),
161 .onHangAnnotationsCallback =
162 std::move(event.onHangAnnotationsCallback),
163 .parentId = event.parentId}))
164 .first;
165 updateTaskParent(events, it->second, event.timeOccurred);
166 },
167 [this, &events](typename MonitoredEventType::Touch& event) {
168 auto it = mMonitoredTasks.find(event.id);
169 if (it == mMonitoredTasks.end()) {
170 ALOGE("HealthMonitor has no task in progress for id %llu",
171 (unsigned long long)event.id);
172 return;
173 }
174
175 auto& task = it->second;
176 task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
177 updateTaskParent(events, task, event.timeOccurred);
178 },
179 [this, &tasksToRemove,
180 &events](typename MonitoredEventType::Stop& event) {
181 auto it = mMonitoredTasks.find(event.id);
182 if (it == mMonitoredTasks.end()) {
183 ALOGE("HealthMonitor has no task in progress for id %llu",
184 (unsigned long long)event.id);
185 return;
186 }
187
188 auto& task = it->second;
189 task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
190 updateTaskParent(events, task, event.timeOccurred);
191
192 // Mark it for deletion, but retain it until the end of
193 // the health check concurrent tasks hung
194 tasksToRemove.insert(event.id);
195 },
196 [&keepMonitoring](typename MonitoredEventType::EndMonitoring& event) {
197 keepMonitoring = false;
198 },
199 [&pollPromises](typename MonitoredEventType::Poll& event) {
200 pollPromises.push_back(std::move(event.complete));
201 }},
202 *event);
203 }
204
205 // Sort by what times out first. Identical timestamps are possible
206 std::multimap<Timestamp, uint64_t> sortedTasks;
207 for (auto& [_, task] : mMonitoredTasks) {
208 sortedTasks.insert(std::pair<Timestamp, uint64_t>(task.timeoutTimestamp, task.id));
209 }
210
211 for (auto& [_, task_id] : sortedTasks) {
212 auto& task = mMonitoredTasks[task_id];
213 if (task.timeoutTimestamp < now) {
214 // Newly hung task
215 if (!task.hungTimestamp.has_value()) {
216 // Copy over additional annotations captured at hangTime
217 if (task.onHangAnnotationsCallback) {
218 auto newAnnotations = (*task.onHangAnnotationsCallback)();
219 task.metadata->mergeAnnotations(std::move(newAnnotations));
220 }
221 mConsumer.consumeHangEvent(task.id, task.metadata.get(), newHungTasks);
222 task.hungTimestamp = task.timeoutTimestamp;
223 newHungTasks++;
224 }
225 } else {
226 // Task resumes
227 if (task.hungTimestamp.has_value()) {
228 auto hangTime = duration_cast<std::chrono::milliseconds>(
229 task.timeoutTimestamp -
230 (task.hungTimestamp.value() + task.timeoutThreshold))
231 .count();
232 mConsumer.consumeUnHangEvent(task.id, task.metadata.get(), hangTime);
233 task.hungTimestamp = std::nullopt;
234 newHungTasks--;
235 }
236 }
237 if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
238 mMonitoredTasks.erase(task_id);
239 }
240 }
241
242 if (mHungTasks != newHungTasks) {
243 ALOGE("HealthMonitor: Number of unresponsive tasks %s: %d -> %d",
244 mHungTasks < newHungTasks ? "increased" : "decreaased", mHungTasks, newHungTasks);
245 mHungTasks = newHungTasks;
246 }
247
248 for (auto& complete : pollPromises) {
249 complete.set_value();
250 }
251 }
252
253 return 0;
254 }
255
256 template <class Clock>
updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>> & events,const MonitoredTask & task,Timestamp eventTime)257 void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
258 const MonitoredTask& task, Timestamp eventTime) {
259 std::optional<Id> parentId = task.parentId;
260 if (parentId) {
261 auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{
262 .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)});
263 events.push(std::move(event));
264 }
265 }
266
CreateHealthMonitor(HealthMonitorConsumer & consumer,uint64_t heartbeatInterval)267 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(HealthMonitorConsumer& consumer,
268 uint64_t heartbeatInterval) {
269 #ifdef ENABLE_ANDROID_HEALTH_MONITOR
270 ALOGI("HealthMonitor enabled. Returning monitor.");
271 return std::make_unique<HealthMonitor<>>(consumer, heartbeatInterval);
272 #else
273 ALOGI("HealthMonitor disabled. Returning nullptr");
274 return nullptr;
275 #endif
276 }
277
278 template class HealthMonitor<steady_clock>;
279
280 } // namespace guest
281 } // namespace base
282 } // namespace android
283