1 /* 2 * Copyright (C) 2022 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #pragma once 17 18 #include <chrono> 19 #include <functional> 20 #include <future> 21 #include <optional> 22 #include <queue> 23 #include <stack> 24 #include <string> 25 #include <type_traits> 26 #include <unordered_map> 27 #include <unordered_set> 28 #include <variant> 29 #include <utility> 30 31 #include "aemu/base/AndroidHealthMonitorConsumer.h" 32 #include "aemu/base/synchronization/AndroidConditionVariable.h" 33 #include "aemu/base/synchronization/AndroidLock.h" 34 #include "aemu/base/threads/AndroidThread.h" 35 36 #include <log/log.h> 37 38 using gfxstream::guest::EventHangMetadata; 39 40 #define WATCHDOG_BUILDER(healthMonitorPtr, msg) \ 41 gfxstream::guest::HealthWatchdogBuilder<std::decay_t<decltype(*(healthMonitorPtr))>>( \ 42 (healthMonitorPtr), __FILE__, __func__, msg, __LINE__) 43 44 namespace gfxstream { 45 namespace guest { 46 47 using gfxstream::guest::ConditionVariable; 48 using gfxstream::guest::Lock; 49 using std::chrono::duration; 50 using std::chrono::steady_clock; 51 using std::chrono::time_point; 52 using HangAnnotations = EventHangMetadata::HangAnnotations; 53 54 static uint64_t kDefaultIntervalMs = 1'000; 55 static uint64_t kDefaultTimeoutMs = 5'000; 56 static std::chrono::nanoseconds kTimeEpsilon(1); 57 58 // HealthMonitor provides the ability to register arbitrary start/touch/stop events associated 59 // with client defined tasks. At some pre-defined interval, it will periodically consume 60 // all logged events to assess whether the system is hanging on any task. Via the 61 // HealthMonitorConsumer, it will log hang and unhang events when it detects tasks hanging/resuming. 62 // Design doc: http://go/gfxstream-health-monitor 63 template <class Clock = steady_clock> 64 class HealthMonitor : public gfxstream::guest::Thread { 65 public: 66 // Alias for task id. 67 using Id = uint64_t; 68 69 // Constructor 70 // `heatbeatIntervalMs` is the interval, in milleseconds, that the thread will sleep for 71 // in between health checks. 72 HealthMonitor(HealthMonitorConsumer& consumer, uint64_t heartbeatInterval = kDefaultIntervalMs); 73 74 // Destructor 75 // Enqueues an event to end monitoring and waits on thread to process remaining queued events. 76 ~HealthMonitor(); 77 78 // Start monitoring a task. Returns an id that is used for touch and stop operations. 79 // `metadata` is a struct containing info on the task watchdog to be passed through to the 80 // metrics logger. 81 // `onHangAnnotationsCallback` is an optional containing a callable that will return key-value 82 // string pairs to be recorded at the time a hang is detected, which is useful for debugging. 83 // `timeout` is the duration in milliseconds a task is allowed to run before it's 84 // considered "hung". Because `timeout` must be larger than the monitor's heartbeat 85 // interval, as shorter timeout periods would not be detected, this method will set actual 86 // timeout to the lesser of `timeout` and twice the heartbeat interval. 87 // `parentId` can be the Id of another task. Events in this monitored task will update 88 // the parent task recursively. 89 Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata, 90 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 91 onHangAnnotationsCallback = std::nullopt, 92 uint64_t timeout = kDefaultTimeoutMs, 93 std::optional<Id> parentId = std::nullopt); 94 95 // Touch a monitored task. Resets the timeout countdown for that task. 96 void touchMonitoredTask(Id id); 97 98 // Stop monitoring a task. 99 void stopMonitoringTask(Id id); 100 101 private: 102 using Duration = typename Clock::duration; // duration<double>; 103 using Timestamp = time_point<Clock, Duration>; 104 105 // Allow test class access to private functions 106 friend class HealthMonitorTest; 107 108 struct MonitoredEventType { 109 struct Start { 110 Id id; 111 std::unique_ptr<EventHangMetadata> metadata; 112 Timestamp timeOccurred; 113 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 114 onHangAnnotationsCallback; 115 Duration timeoutThreshold; 116 std::optional<Id> parentId; 117 }; 118 struct Touch { 119 Id id; 120 Timestamp timeOccurred; 121 }; 122 struct Stop { 123 Id id; 124 Timestamp timeOccurred; 125 }; 126 struct EndMonitoring {}; 127 struct Poll { 128 std::promise<void> complete; 129 }; 130 }; 131 132 using MonitoredEvent = 133 std::variant<std::monostate, typename MonitoredEventType::Start, 134 typename MonitoredEventType::Touch, typename MonitoredEventType::Stop, 135 typename MonitoredEventType::EndMonitoring, typename MonitoredEventType::Poll>; 136 137 struct MonitoredTask { 138 Id id; 139 Timestamp timeoutTimestamp; 140 Duration timeoutThreshold; 141 std::optional<Timestamp> hungTimestamp; 142 std::unique_ptr<EventHangMetadata> metadata; 143 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback; 144 std::optional<Id> parentId; 145 }; 146 147 // Thread's main loop 148 intptr_t main() override; 149 150 // Update the parent task 151 void updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events, 152 const MonitoredTask& task, Timestamp eventTime); 153 154 // Explicitly wake the monitor thread. Returns a future that can be used to wait until the 155 // poll event has been processed. 156 std::future<void> poll(); 157 158 // Immutable. Multi-thread access is safe. 159 const Duration mInterval; 160 161 // Members accessed only on the worker thread. Not protected by mutex. 162 int mHungTasks = 0; 163 HealthMonitorConsumer& mConsumer; 164 std::unordered_map<Id, MonitoredTask> mMonitoredTasks; 165 166 // Lock and cv control access to queue and id counter 167 ConditionVariable mCv; 168 Lock mLock; 169 Id mNextId = 0; 170 std::queue<std::unique_ptr<MonitoredEvent>> mEventQueue; 171 }; 172 173 // This class provides an RAII mechanism for monitoring a task. 174 // HealthMonitorT should have the exact same interface as HealthMonitor. Note that HealthWatchdog 175 // can be used in performance critical path, so we use a template to dispatch a call here to 176 // overcome the performance cost of virtual function dispatch. 177 template <class HealthMonitorT = HealthMonitor<>> 178 class HealthWatchdog { 179 public: 180 HealthWatchdog(HealthMonitorT* healthMonitor, std::unique_ptr<EventHangMetadata> metadata, 181 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> 182 onHangAnnotationsCallback = std::nullopt, 183 uint64_t timeout = kDefaultTimeoutMs) mHealthMonitor(healthMonitor)184 : mHealthMonitor(healthMonitor), mThreadId(getCurrentThreadId()) { 185 if (!mHealthMonitor) { 186 mId = std::nullopt; 187 return; 188 } 189 // TODO: willho@ re-enable thread awareness b/253483619 190 typename HealthMonitorT::Id id = mHealthMonitor->startMonitoringTask( 191 std::move(metadata), std::move(onHangAnnotationsCallback), timeout, std::nullopt); 192 mId = id; 193 } 194 ~HealthWatchdog()195 ~HealthWatchdog() { 196 if (!mId.has_value()) { 197 return; 198 } 199 mHealthMonitor->stopMonitoringTask(*mId); 200 } 201 touch()202 void touch() { 203 if (!mId.has_value()) { 204 return; 205 } 206 mHealthMonitor->touchMonitoredTask(*mId); 207 } 208 209 // Return the underlying Id, and don't issue a stop on destruction. release()210 std::optional<typename HealthMonitorT::Id> release() { 211 return std::exchange(mId, std::nullopt); 212 } 213 214 private: 215 using ThreadTasks = 216 std::unordered_map<HealthMonitorT*, std::stack<typename HealthMonitorT::Id>>; 217 std::optional<typename HealthMonitorT::Id> mId; 218 HealthMonitorT* mHealthMonitor; 219 const unsigned long mThreadId; 220 }; 221 222 // HealthMonitorT should have the exact same interface as HealthMonitor. This template parameter is 223 // used for injecting a different type for testing. 224 template <class HealthMonitorT> 225 class HealthWatchdogBuilder { 226 public: HealthWatchdogBuilder(HealthMonitorT * healthMonitor,const char * fileName,const char * functionName,const char * message,uint32_t line)227 HealthWatchdogBuilder(HealthMonitorT* healthMonitor, const char* fileName, 228 const char* functionName, const char* message, uint32_t line) 229 : mHealthMonitor(healthMonitor), 230 mMetadata(std::make_unique<EventHangMetadata>( 231 fileName, functionName, message, line, EventHangMetadata::HangType::kOther, nullptr)), 232 mTimeoutMs(kDefaultTimeoutMs), 233 mOnHangCallback(std::nullopt) {} 234 235 DISALLOW_COPY_ASSIGN_AND_MOVE(HealthWatchdogBuilder); 236 setHangType(EventHangMetadata::HangType hangType)237 HealthWatchdogBuilder& setHangType(EventHangMetadata::HangType hangType) { 238 if (mHealthMonitor) mMetadata->hangType = hangType; 239 return *this; 240 } setTimeoutMs(uint32_t timeoutMs)241 HealthWatchdogBuilder& setTimeoutMs(uint32_t timeoutMs) { 242 if (mHealthMonitor) mTimeoutMs = timeoutMs; 243 return *this; 244 } 245 // F should be a callable that returns a std::unique_ptr<EventHangMetadata::HangAnnotations>. We 246 // use template instead of std::function here to avoid extra copy. 247 template <class F> setOnHangCallback(F && callback)248 HealthWatchdogBuilder& setOnHangCallback(F&& callback) { 249 if (mHealthMonitor) { 250 mOnHangCallback = 251 std::function<std::unique_ptr<HangAnnotations>()>(std::forward<F>(callback)); 252 } 253 return *this; 254 } 255 setAnnotations(std::unique_ptr<HangAnnotations> annotations)256 HealthWatchdogBuilder& setAnnotations(std::unique_ptr<HangAnnotations> annotations) { 257 if (mHealthMonitor) mMetadata->data = std::move(annotations); 258 return *this; 259 } 260 build()261 std::unique_ptr<HealthWatchdog<HealthMonitorT>> build() { 262 // We are allocating on the heap, so there is a performance hit. However we also allocate 263 // EventHangMetadata on the heap, so this should be Ok. If we see performance issues with 264 // these allocations, for HealthWatchdog, we can always use placement new + noop deleter to 265 // avoid heap allocation for HealthWatchdog. 266 return std::make_unique<HealthWatchdog<HealthMonitorT>>( 267 mHealthMonitor, std::move(mMetadata), std::move(mOnHangCallback), mTimeoutMs); 268 } 269 270 private: 271 HealthMonitorT* mHealthMonitor; 272 std::unique_ptr<EventHangMetadata> mMetadata; 273 uint32_t mTimeoutMs; 274 std::optional<std::function<std::unique_ptr<HangAnnotations>()>> mOnHangCallback; 275 }; 276 277 std::unique_ptr<HealthMonitor<>> CreateHealthMonitor( 278 HealthMonitorConsumer& consumer, uint64_t heartbeatInterval = kDefaultIntervalMs); 279 280 } // namespace guest 281 } // namespace gfxstream 282