• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2022 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "aemu/base/AndroidHealthMonitor.h"
17 
18 #include <map>
19 #include <sys/time.h>
20 
21 namespace android {
22 namespace base {
23 namespace guest {
24 
25 using android::base::guest::AutoLock;
26 using std::chrono::duration_cast;
27 
// Overload-set helper for std::visit: inherits from every callable it is given and
// re-exports each call operator, so one visitor object dispatches on the active
// alternative of a variant.
template <typename... Handlers>
struct MonitoredEventVisitor : Handlers... {
    using Handlers::operator()...;
};

// Deduction guide so `MonitoredEventVisitor{lambdaA, lambdaB, ...}` deduces the handler
// types from its arguments.
template <typename... Handlers>
MonitoredEventVisitor(Handlers...) -> MonitoredEventVisitor<Handlers...>;
34 
35 template <class Clock>
HealthMonitor(HealthMonitorConsumer & consumer,uint64_t heartbeatInterval)36 HealthMonitor<Clock>::HealthMonitor(HealthMonitorConsumer& consumer, uint64_t heartbeatInterval)
37     : mInterval(Duration(std::chrono::milliseconds(heartbeatInterval))), mConsumer(consumer) {
38     start();
39 }
40 
41 template <class Clock>
~HealthMonitor()42 HealthMonitor<Clock>::~HealthMonitor() {
43     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::EndMonitoring{});
44     {
45         AutoLock lock(mLock);
46         mEventQueue.push(std::move(event));
47     }
48     poll();
49     wait();
50 }
51 
52 template <class Clock>
startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,std::optional<std::function<std::unique_ptr<HangAnnotations> ()>> onHangAnnotationsCallback,uint64_t timeout,std::optional<Id> parentId)53 typename HealthMonitor<Clock>::Id HealthMonitor<Clock>::startMonitoringTask(
54     std::unique_ptr<EventHangMetadata> metadata,
55     std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback,
56     uint64_t timeout, std::optional<Id> parentId) {
57     auto intervalMs = duration_cast<std::chrono::milliseconds>(mInterval).count();
58     if (timeout < intervalMs) {
59         ALOGW("Timeout value %llu is too low (heartbeat is every %llu). Increasing to %llu",
60               (unsigned long long)timeout, (unsigned long long) intervalMs,
61               (unsigned long long)intervalMs * 2);
62         timeout = intervalMs * 2;
63     }
64 
65     AutoLock lock(mLock);
66     auto id = mNextId++;
67     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Start{
68         .id = id,
69         .metadata = std::move(metadata),
70         .timeOccurred = Clock::now(),
71         .onHangAnnotationsCallback = std::move(onHangAnnotationsCallback),
72         .timeoutThreshold = Duration(std::chrono::milliseconds(timeout)),
73         .parentId = parentId});
74     mEventQueue.push(std::move(event));
75     return id;
76 }
77 
78 template <class Clock>
touchMonitoredTask(Id id)79 void HealthMonitor<Clock>::touchMonitoredTask(Id id) {
80     auto event = std::make_unique<MonitoredEvent>(
81         typename MonitoredEventType::Touch{.id = id, .timeOccurred = Clock::now()});
82     AutoLock lock(mLock);
83     mEventQueue.push(std::move(event));
84 }
85 
86 template <class Clock>
stopMonitoringTask(Id id)87 void HealthMonitor<Clock>::stopMonitoringTask(Id id) {
88     auto event = std::make_unique<MonitoredEvent>(
89         typename MonitoredEventType::Stop{.id = id, .timeOccurred = Clock::now()});
90     AutoLock lock(mLock);
91     mEventQueue.push(std::move(event));
92 }
93 
94 template <class Clock>
poll()95 std::future<void> HealthMonitor<Clock>::poll() {
96     auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Poll{});
97     std::future<void> ret =
98         std::get<typename MonitoredEventType::Poll>(*event).complete.get_future();
99 
100     AutoLock lock(mLock);
101     mEventQueue.push(std::move(event));
102     mCv.signalAndUnlock(&lock);
103     return ret;
104 }
105 
106 // Thread's main loop
107 template <class Clock>
main()108 intptr_t HealthMonitor<Clock>::main() {
109     bool keepMonitoring = true;
110     std::queue<std::unique_ptr<MonitoredEvent>> events;
111 
112     while (keepMonitoring) {
113         std::vector<std::promise<void>> pollPromises;
114         std::unordered_set<Id> tasksToRemove;
115         int newHungTasks = mHungTasks;
116         {
117             AutoLock lock(mLock);
118             struct timeval currentTime;
119             gettimeofday(&currentTime, 0);
120             if (mEventQueue.empty()) {
121                 mCv.timedWait(
122                     &mLock,
123                     currentTime.tv_sec * 1000000LL + currentTime.tv_usec +
124                         std::chrono::duration_cast<std::chrono::microseconds>(mInterval).count());
125             }
126             mEventQueue.swap(events);
127         }
128 
129         Timestamp now = Clock::now();
130         while (!events.empty()) {
131             auto event(std::move(events.front()));
132             events.pop();
133 
134             std::visit(MonitoredEventVisitor{
135                            [](std::monostate& event) {
136                                ALOGE("MonitoredEvent type not found");
137                                abort();
138                            },
139                            [this, &events](typename MonitoredEventType::Start& event) {
140                                auto it = mMonitoredTasks.find(event.id);
141                                if (it != mMonitoredTasks.end()) {
142                                    ALOGE("Registered multiple start events for task %llu",
143                                          (unsigned long long)event.id);
144                                    return;
145                                }
146                                if (event.parentId && mMonitoredTasks.find(event.parentId.value()) ==
147                                                          mMonitoredTasks.end()) {
148                                    ALOGW("Requested parent task %llu does not exist.",
149                                          (unsigned long long)event.parentId.value());
150                                    event.parentId = std::nullopt;
151                                }
152                                it = mMonitoredTasks
153                                         .emplace(event.id,
154                                                  std::move(MonitoredTask{
155                                                      .id = event.id,
156                                                      .timeoutTimestamp = event.timeOccurred +
157                                                                          event.timeoutThreshold,
158                                                      .timeoutThreshold = event.timeoutThreshold,
159                                                      .hungTimestamp = std::nullopt,
160                                                      .metadata = std::move(event.metadata),
161                                                      .onHangAnnotationsCallback =
162                                                          std::move(event.onHangAnnotationsCallback),
163                                                      .parentId = event.parentId}))
164                                         .first;
165                                updateTaskParent(events, it->second, event.timeOccurred);
166                            },
167                            [this, &events](typename MonitoredEventType::Touch& event) {
168                                auto it = mMonitoredTasks.find(event.id);
169                                if (it == mMonitoredTasks.end()) {
170                                    ALOGE("HealthMonitor has no task in progress for id %llu",
171                                          (unsigned long long)event.id);
172                                    return;
173                                }
174 
175                                auto& task = it->second;
176                                task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
177                                updateTaskParent(events, task, event.timeOccurred);
178                            },
179                            [this, &tasksToRemove,
180                             &events](typename MonitoredEventType::Stop& event) {
181                                auto it = mMonitoredTasks.find(event.id);
182                                if (it == mMonitoredTasks.end()) {
183                                    ALOGE("HealthMonitor has no task in progress for id %llu",
184                                          (unsigned long long)event.id);
185                                    return;
186                                }
187 
188                                auto& task = it->second;
189                                task.timeoutTimestamp = event.timeOccurred + task.timeoutThreshold;
190                                updateTaskParent(events, task, event.timeOccurred);
191 
192                                // Mark it for deletion, but retain it until the end of
193                                // the health check concurrent tasks hung
194                                tasksToRemove.insert(event.id);
195                            },
196                            [&keepMonitoring](typename MonitoredEventType::EndMonitoring& event) {
197                                keepMonitoring = false;
198                            },
199                            [&pollPromises](typename MonitoredEventType::Poll& event) {
200                                pollPromises.push_back(std::move(event.complete));
201                            }},
202                        *event);
203         }
204 
205         // Sort by what times out first. Identical timestamps are possible
206         std::multimap<Timestamp, uint64_t> sortedTasks;
207         for (auto& [_, task] : mMonitoredTasks) {
208             sortedTasks.insert(std::pair<Timestamp, uint64_t>(task.timeoutTimestamp, task.id));
209         }
210 
211         for (auto& [_, task_id] : sortedTasks) {
212             auto& task = mMonitoredTasks[task_id];
213             if (task.timeoutTimestamp < now) {
214                 // Newly hung task
215                 if (!task.hungTimestamp.has_value()) {
216                     // Copy over additional annotations captured at hangTime
217                     if (task.onHangAnnotationsCallback) {
218                         auto newAnnotations = (*task.onHangAnnotationsCallback)();
219                         task.metadata->mergeAnnotations(std::move(newAnnotations));
220                     }
221                     mConsumer.consumeHangEvent(task.id, task.metadata.get(), newHungTasks);
222                     task.hungTimestamp = task.timeoutTimestamp;
223                     newHungTasks++;
224                 }
225             } else {
226                 // Task resumes
227                 if (task.hungTimestamp.has_value()) {
228                     auto hangTime = duration_cast<std::chrono::milliseconds>(
229                                         task.timeoutTimestamp -
230                                         (task.hungTimestamp.value() + task.timeoutThreshold))
231                                         .count();
232                     mConsumer.consumeUnHangEvent(task.id, task.metadata.get(), hangTime);
233                     task.hungTimestamp = std::nullopt;
234                     newHungTasks--;
235                 }
236             }
237             if (tasksToRemove.find(task_id) != tasksToRemove.end()) {
238                 mMonitoredTasks.erase(task_id);
239             }
240         }
241 
242         if (mHungTasks != newHungTasks) {
243             ALOGE("HealthMonitor: Number of unresponsive tasks %s: %d -> %d",
244                 mHungTasks < newHungTasks ? "increased" : "decreaased", mHungTasks, newHungTasks);
245             mHungTasks = newHungTasks;
246         }
247 
248         for (auto& complete : pollPromises) {
249             complete.set_value();
250         }
251     }
252 
253     return 0;
254 }
255 
256 template <class Clock>
updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>> & events,const MonitoredTask & task,Timestamp eventTime)257 void HealthMonitor<Clock>::updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
258                                             const MonitoredTask& task, Timestamp eventTime) {
259     std::optional<Id> parentId = task.parentId;
260     if (parentId) {
261         auto event = std::make_unique<MonitoredEvent>(typename MonitoredEventType::Touch{
262             .id = parentId.value(), .timeOccurred = eventTime + Duration(kTimeEpsilon)});
263         events.push(std::move(event));
264     }
265 }
266 
CreateHealthMonitor(HealthMonitorConsumer & consumer,uint64_t heartbeatInterval)267 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(HealthMonitorConsumer& consumer,
268                                                      uint64_t heartbeatInterval) {
269 #ifdef ENABLE_ANDROID_HEALTH_MONITOR
270     ALOGI("HealthMonitor enabled. Returning monitor.");
271     return std::make_unique<HealthMonitor<>>(consumer, heartbeatInterval);
272 #else
273     ALOGI("HealthMonitor disabled. Returning nullptr");
274     return nullptr;
275 #endif
276 }
277 
// Explicit instantiation definition of HealthMonitor for steady_clock, so the member
// templates defined in this file are emitted in this translation unit.
// NOTE(review): presumably steady_clock is the default Clock behind `HealthMonitor<>` --
// confirm in the header.
278 template class HealthMonitor<steady_clock>;
279 
280 }  // namespace guest
281 }  // namespace base
282 }  // namespace android
283