1 /* 2 * Copyright (C) 2022 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #pragma once 17 18 #include <chrono> 19 #include <functional> 20 #include <future> 21 #include <optional> 22 #include <queue> 23 #include <stack> 24 #include <string> 25 #include <type_traits> 26 #include <unordered_map> 27 #include <unordered_set> 28 #include <variant> 29 #include <utility> 30 31 #include "aemu/base/AndroidHealthMonitorConsumer.h" 32 #include "aemu/base/synchronization/AndroidConditionVariable.h" 33 #include "aemu/base/synchronization/AndroidLock.h" 34 #include "aemu/base/threads/AndroidThread.h" 35 36 #include <log/log.h> 37 38 using android::base::guest::EventHangMetadata; 39 40 #define WATCHDOG_BUILDER(healthMonitorPtr, msg) \ 41 ::android::base::guest::HealthWatchdogBuilder<std::decay_t<decltype(*(healthMonitorPtr))>>( \ 42 (healthMonitorPtr), __FILE__, __func__, msg, __LINE__) 43 44 namespace android { 45 namespace base { 46 namespace guest { 47 48 using android::base::guest::ConditionVariable; 49 using android::base::guest::Lock; 50 using std::chrono::duration; 51 using std::chrono::steady_clock; 52 using std::chrono::time_point; 53 using HangAnnotations = EventHangMetadata::HangAnnotations; 54 55 static uint64_t kDefaultIntervalMs = 1'000; 56 static uint64_t kDefaultTimeoutMs = 5'000; 57 static std::chrono::nanoseconds kTimeEpsilon(1); 58 59 // HealthMonitor provides the ability to register arbitrary start/touch/stop events associated 60 // with client defined tasks. At some pre-defined interval, it will periodically consume 61 // all logged events to assess whether the system is hanging on any task. Via the 62 // HealthMonitorConsumer, it will log hang and unhang events when it detects tasks hanging/resuming. 63 // Design doc: http://go/gfxstream-health-monitor 64 template <class Clock = steady_clock> 65 class HealthMonitor : public android::base::guest::Thread { 66 public: 67 // Alias for task id. 68 using Id = uint64_t; 69 70 // Constructor 71 // `heatbeatIntervalMs` is the interval, in milleseconds, that the thread will sleep for 72 // in between health checks. 73 HealthMonitor(HealthMonitorConsumer& consumer, uint64_t heartbeatInterval = kDefaultIntervalMs); 74 75 // Destructor 76 // Enqueues an event to end monitoring and waits on thread to process remaining queued events. 77 ~HealthMonitor(); 78 79 // Start monitoring a task. Returns an id that is used for touch and stop operations. 80 // `metadata` is a struct containing info on the task watchdog to be passed through to the 81 // metrics logger. 82 // `onHangAnnotationsCallback` is an optional containing a callable that will return key-value 83 // string pairs to be recorded at the time a hang is detected, which is useful for debugging. 84 // `timeout` is the duration in milliseconds a task is allowed to run before it's 85 // considered "hung". Because `timeout` must be larger than the monitor's heartbeat 86 // interval, as shorter timeout periods would not be detected, this method will set actual 87 // timeout to the lesser of `timeout` and twice the heartbeat interval. 88 // `parentId` can be the Id of another task. Events in this monitored task will update 89 // the parent task recursively. 90 Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata, 91 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 92 onHangAnnotationsCallback = std::nullopt, 93 uint64_t timeout = kDefaultTimeoutMs, 94 std::optional<Id> parentId = std::nullopt); 95 96 // Touch a monitored task. Resets the timeout countdown for that task. 97 void touchMonitoredTask(Id id); 98 99 // Stop monitoring a task. 100 void stopMonitoringTask(Id id); 101 102 private: 103 using Duration = typename Clock::duration; // duration<double>; 104 using Timestamp = time_point<Clock, Duration>; 105 106 // Allow test class access to private functions 107 friend class HealthMonitorTest; 108 109 struct MonitoredEventType { 110 struct Start { 111 Id id; 112 std::unique_ptr<EventHangMetadata> metadata; 113 Timestamp timeOccurred; 114 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 115 onHangAnnotationsCallback; 116 Duration timeoutThreshold; 117 std::optional<Id> parentId; 118 }; 119 struct Touch { 120 Id id; 121 Timestamp timeOccurred; 122 }; 123 struct Stop { 124 Id id; 125 Timestamp timeOccurred; 126 }; 127 struct EndMonitoring {}; 128 struct Poll { 129 std::promise<void> complete; 130 }; 131 }; 132 133 using MonitoredEvent = 134 std::variant<std::monostate, typename MonitoredEventType::Start, 135 typename MonitoredEventType::Touch, typename MonitoredEventType::Stop, 136 typename MonitoredEventType::EndMonitoring, typename MonitoredEventType::Poll>; 137 138 struct MonitoredTask { 139 Id id; 140 Timestamp timeoutTimestamp; 141 Duration timeoutThreshold; 142 std::optional<Timestamp> hungTimestamp; 143 std::unique_ptr<EventHangMetadata> metadata; 144 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback; 145 std::optional<Id> parentId; 146 }; 147 148 // Thread's main loop 149 intptr_t main() override; 150 151 // Update the parent task 152 void updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events, 153 const MonitoredTask& task, Timestamp eventTime); 154 155 // Explicitly wake the monitor thread. Returns a future that can be used to wait until the 156 // poll event has been processed. 157 std::future<void> poll(); 158 159 // Immutable. Multi-thread access is safe. 160 const Duration mInterval; 161 162 // Members accessed only on the worker thread. Not protected by mutex. 163 int mHungTasks = 0; 164 HealthMonitorConsumer& mConsumer; 165 std::unordered_map<Id, MonitoredTask> mMonitoredTasks; 166 167 // Lock and cv control access to queue and id counter 168 ConditionVariable mCv; 169 Lock mLock; 170 Id mNextId = 0; 171 std::queue<std::unique_ptr<MonitoredEvent>> mEventQueue; 172 }; 173 174 // This class provides an RAII mechanism for monitoring a task. 175 // HealthMonitorT should have the exact same interface as HealthMonitor. Note that HealthWatchdog 176 // can be used in performance critical path, so we use a template to dispatch a call here to 177 // overcome the performance cost of virtual function dispatch. 178 template <class HealthMonitorT = HealthMonitor<>> 179 class HealthWatchdog { 180 public: 181 HealthWatchdog(HealthMonitorT* healthMonitor, std::unique_ptr<EventHangMetadata> metadata, 182 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 183 onHangAnnotationsCallback = std::nullopt, 184 uint64_t timeout = kDefaultTimeoutMs) mHealthMonitor(healthMonitor)185 : mHealthMonitor(healthMonitor), mThreadId(getCurrentThreadId()) { 186 if (!mHealthMonitor) { 187 mId = std::nullopt; 188 return; 189 } 190 // TODO: willho@ re-enable thread awareness b/253483619 191 typename HealthMonitorT::Id id = mHealthMonitor->startMonitoringTask( 192 std::move(metadata), std::move(onHangAnnotationsCallback), timeout, std::nullopt); 193 mId = id; 194 } 195 ~HealthWatchdog()196 ~HealthWatchdog() { 197 if (!mId.has_value()) { 198 return; 199 } 200 mHealthMonitor->stopMonitoringTask(*mId); 201 } 202 touch()203 void touch() { 204 if (!mId.has_value()) { 205 return; 206 } 207 mHealthMonitor->touchMonitoredTask(*mId); 208 } 209 210 // Return the underlying Id, and don't issue a stop on destruction. release()211 std::optional<typename HealthMonitorT::Id> release() { 212 return std::exchange(mId, std::nullopt); 213 } 214 215 private: 216 using ThreadTasks = 217 std::unordered_map<HealthMonitorT*, std::stack<typename HealthMonitorT::Id>>; 218 std::optional<typename HealthMonitorT::Id> mId; 219 HealthMonitorT* mHealthMonitor; 220 const unsigned long mThreadId; 221 }; 222 223 // HealthMonitorT should have the exact same interface as HealthMonitor. This template parameter is 224 // used for injecting a different type for testing. 225 template <class HealthMonitorT> 226 class HealthWatchdogBuilder { 227 public: HealthWatchdogBuilder(HealthMonitorT * healthMonitor,const char * fileName,const char * functionName,const char * message,uint32_t line)228 HealthWatchdogBuilder(HealthMonitorT* healthMonitor, const char* fileName, 229 const char* functionName, const char* message, uint32_t line) 230 : mHealthMonitor(healthMonitor), 231 mMetadata(std::make_unique<EventHangMetadata>( 232 fileName, functionName, message, line, EventHangMetadata::HangType::kOther, nullptr)), 233 mTimeoutMs(kDefaultTimeoutMs), 234 mOnHangCallback(std::nullopt) {} 235 236 DISALLOW_COPY_ASSIGN_AND_MOVE(HealthWatchdogBuilder); 237 setHangType(EventHangMetadata::HangType hangType)238 HealthWatchdogBuilder& setHangType(EventHangMetadata::HangType hangType) { 239 if (mHealthMonitor) mMetadata->hangType = hangType; 240 return *this; 241 } setTimeoutMs(uint32_t timeoutMs)242 HealthWatchdogBuilder& setTimeoutMs(uint32_t timeoutMs) { 243 if (mHealthMonitor) mTimeoutMs = timeoutMs; 244 return *this; 245 } 246 // F should be a callable that returns a std::unique_ptr<EventHangMetadata::HangAnnotations>. We 247 // use template instead of std::function here to avoid extra copy. 248 template <class F> setOnHangCallback(F && callback)249 HealthWatchdogBuilder& setOnHangCallback(F&& callback) { 250 if (mHealthMonitor) { 251 mOnHangCallback = 252 std::function<std::unique_ptr<HangAnnotations>()>(std::forward<F>(callback)); 253 } 254 return *this; 255 } 256 setAnnotations(std::unique_ptr<HangAnnotations> annotations)257 HealthWatchdogBuilder& setAnnotations(std::unique_ptr<HangAnnotations> annotations) { 258 if (mHealthMonitor) mMetadata->data = std::move(annotations); 259 return *this; 260 } 261 build()262 std::unique_ptr<HealthWatchdog<HealthMonitorT>> build() { 263 // We are allocating on the heap, so there is a performance hit. However we also allocate 264 // EventHangMetadata on the heap, so this should be Ok. If we see performance issues with 265 // these allocations, for HealthWatchdog, we can always use placement new + noop deleter to 266 // avoid heap allocation for HealthWatchdog. 267 return std::make_unique<HealthWatchdog<HealthMonitorT>>( 268 mHealthMonitor, std::move(mMetadata), std::move(mOnHangCallback), mTimeoutMs); 269 } 270 271 private: 272 HealthMonitorT* mHealthMonitor; 273 std::unique_ptr<EventHangMetadata> mMetadata; 274 uint32_t mTimeoutMs; 275 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> mOnHangCallback; 276 }; 277 278 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor( 279 HealthMonitorConsumer& consumer, uint64_t heartbeatInterval = kDefaultIntervalMs); 280 281 } // namespace guest 282 } // namespace base 283 } // namespace android 284