1 // Copyright 2021, The Android Open Source Project 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Can be removed when instrumentations are added to keystore. 16 #![allow(dead_code)] 17 18 //! This module implements a watchdog thread. 19 20 use std::{ 21 cmp::min, 22 collections::HashMap, 23 sync::Arc, 24 sync::{Condvar, Mutex, MutexGuard}, 25 thread, 26 }; 27 use std::{ 28 marker::PhantomData, 29 time::{Duration, Instant}, 30 }; 31 32 #[cfg(test)] 33 mod tests; 34 35 /// Represents a Watchdog record. It can be created with `Watchdog::watch` or 36 /// `Watchdog::watch_with`. It disarms the record when dropped. 37 pub struct WatchPoint { 38 id: &'static str, 39 wd: Arc<Watchdog>, 40 not_send: PhantomData<*mut ()>, // WatchPoint must not be Send. 41 } 42 43 impl Drop for WatchPoint { drop(&mut self)44 fn drop(&mut self) { 45 self.wd.disarm(self.id) 46 } 47 } 48 49 #[derive(Debug, PartialEq, Eq)] 50 enum State { 51 NotRunning, 52 Running, 53 } 54 55 #[derive(Debug, Clone, Hash, PartialEq, Eq)] 56 struct Index { 57 tid: thread::ThreadId, 58 id: &'static str, 59 } 60 61 struct Record { 62 started: Instant, 63 deadline: Instant, 64 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, 65 } 66 67 impl Record { 68 // Return a string representation of the start time of the record. 69 // 70 // Times are hard. This may not be accurate (e.g. if the system clock has been modified since 71 // the watchdog started), but it's _really_ useful to get a starting wall time for overrunning 72 // watchdogs. started_utc(&self) -> String73 fn started_utc(&self) -> String { 74 let started_utc = chrono::Utc::now() - self.started.elapsed(); 75 format!("{}", started_utc.format("%m-%d %H:%M:%S%.3f UTC")) 76 } 77 } 78 79 struct WatchdogState { 80 state: State, 81 thread: Option<thread::JoinHandle<()>>, 82 /// How long to wait before dropping the watchdog thread when idle. 83 idle_timeout: Duration, 84 records: HashMap<Index, Record>, 85 last_report: Option<Instant>, 86 noisy_timeout: Duration, 87 } 88 89 impl WatchdogState { 90 /// If we have overdue records, we want to log them but slowly backoff 91 /// so that we do not clog the logs. We start with logs every 92 /// `MIN_REPORT_TIMEOUT` sec then increment the timeout by 5 up 93 /// to a maximum of `MAX_REPORT_TIMEOUT`. 94 const MIN_REPORT_TIMEOUT: Duration = Duration::from_secs(1); 95 const MAX_REPORT_TIMEOUT: Duration = Duration::from_secs(30); 96 reset_noisy_timeout(&mut self)97 fn reset_noisy_timeout(&mut self) { 98 self.noisy_timeout = Self::MIN_REPORT_TIMEOUT; 99 } 100 update_noisy_timeout(&mut self)101 fn update_noisy_timeout(&mut self) { 102 let noisy_update = self.noisy_timeout + Duration::from_secs(5); 103 self.noisy_timeout = min(Self::MAX_REPORT_TIMEOUT, noisy_update); 104 } 105 overdue_and_next_timeout(&self) -> (bool, Option<Duration>)106 fn overdue_and_next_timeout(&self) -> (bool, Option<Duration>) { 107 let now = Instant::now(); 108 let mut next_timeout: Option<Duration> = None; 109 let mut has_overdue = false; 110 for (_, r) in self.records.iter() { 111 let timeout = r.deadline.saturating_duration_since(now); 112 if timeout == Duration::new(0, 0) { 113 // This timeout has passed. 114 has_overdue = true; 115 } else { 116 // This timeout is still to come; see if it's the closest one to now. 117 next_timeout = match next_timeout { 118 Some(nt) if timeout < nt => Some(timeout), 119 Some(nt) => Some(nt), 120 None => Some(timeout), 121 }; 122 } 123 } 124 (has_overdue, next_timeout) 125 } 126 log_report(&mut self, has_overdue: bool)127 fn log_report(&mut self, has_overdue: bool) { 128 if !has_overdue { 129 // Nothing to report. 130 self.last_report = None; 131 return; 132 } 133 // Something to report... 134 if let Some(reported_at) = self.last_report { 135 if reported_at.elapsed() < self.noisy_timeout { 136 // .. but it's too soon since the last report. 137 self.last_report = None; 138 return; 139 } 140 } 141 self.update_noisy_timeout(); 142 self.last_report = Some(Instant::now()); 143 log::warn!("### Keystore Watchdog report - BEGIN ###"); 144 145 let now = Instant::now(); 146 let mut overdue_records: Vec<(&Index, &Record)> = self 147 .records 148 .iter() 149 .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0)) 150 .collect(); 151 152 log::warn!( 153 concat!( 154 "When extracting from a bug report, please include this header ", 155 "and all {} records below (to footer)" 156 ), 157 overdue_records.len() 158 ); 159 160 // Watch points can be nested, i.e., a single thread may have multiple armed 161 // watch points. And the most recent on each thread (thread recent) is closest to the point 162 // where something is blocked. Furthermore, keystore2 has various critical section 163 // and common backend resources KeyMint that can only be entered serialized. So if one 164 // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch 165 // point is most likely pointing toward the culprit. 166 // Thus, sort by start time first. 167 overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started)); 168 // Then we groups all of the watch points per thread preserving the order within 169 // groups. 170 let groups = overdue_records.iter().fold( 171 HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(), 172 |mut acc, (i, r)| { 173 acc.entry(i.tid).or_default().push((i, r)); 174 acc 175 }, 176 ); 177 // Put the groups back into a vector. 178 let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_values().collect(); 179 // Sort the groups by start time of the most recent (.last()) of each group. 180 // It is panic safe to use unwrap() here because we never add empty vectors to 181 // the map. 182 groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started)); 183 184 for g in groups.iter() { 185 for (i, r) in g.iter() { 186 match &r.context { 187 Some(ctx) => { 188 log::warn!( 189 "{:?} {} Started: {} Pending: {:?} Overdue {:?} for {:?}", 190 i.tid, 191 i.id, 192 r.started_utc(), 193 r.started.elapsed(), 194 r.deadline.elapsed(), 195 ctx 196 ); 197 } 198 None => { 199 log::warn!( 200 "{:?} {} Started: {} Pending: {:?} Overdue {:?}", 201 i.tid, 202 i.id, 203 r.started_utc(), 204 r.started.elapsed(), 205 r.deadline.elapsed() 206 ); 207 } 208 } 209 } 210 } 211 log::warn!("### Keystore Watchdog report - END ###"); 212 } 213 disarm(&mut self, index: Index)214 fn disarm(&mut self, index: Index) { 215 let result = self.records.remove(&index); 216 if let Some(record) = result { 217 let now = Instant::now(); 218 let timeout_left = record.deadline.saturating_duration_since(now); 219 if timeout_left == Duration::new(0, 0) { 220 match &record.context { 221 Some(ctx) => log::info!( 222 "Watchdog complete for: {:?} {} Started: {} Pending: {:?} Overdue {:?} for {:?}", 223 index.tid, 224 index.id, 225 record.started_utc(), 226 record.started.elapsed(), 227 record.deadline.elapsed(), 228 ctx 229 ), 230 None => log::info!( 231 "Watchdog complete for: {:?} {} Started: {} Pending: {:?} Overdue {:?}", 232 index.tid, 233 index.id, 234 record.started_utc(), 235 record.started.elapsed(), 236 record.deadline.elapsed() 237 ), 238 } 239 } 240 } 241 } 242 arm(&mut self, index: Index, record: Record)243 fn arm(&mut self, index: Index, record: Record) { 244 if self.records.insert(index.clone(), record).is_some() { 245 log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index); 246 } 247 } 248 } 249 250 /// Watchdog spawns a thread that logs records of all overdue watch points when a deadline 251 /// is missed and at least every second as long as overdue watch points exist. 252 /// The thread terminates when idle for a given period of time. 253 pub struct Watchdog { 254 state: Arc<(Condvar, Mutex<WatchdogState>)>, 255 } 256 257 impl Watchdog { 258 /// Construct a [`Watchdog`]. When `idle_timeout` has elapsed since the watchdog thread became 259 /// idle, i.e., there are no more active or overdue watch points, the watchdog thread 260 /// terminates. new(idle_timeout: Duration) -> Arc<Self>261 pub fn new(idle_timeout: Duration) -> Arc<Self> { 262 Arc::new(Self { 263 state: Arc::new(( 264 Condvar::new(), 265 Mutex::new(WatchdogState { 266 state: State::NotRunning, 267 thread: None, 268 idle_timeout, 269 records: HashMap::new(), 270 last_report: None, 271 noisy_timeout: WatchdogState::MIN_REPORT_TIMEOUT, 272 }), 273 )), 274 }) 275 } 276 watch_with_optional( wd: Arc<Self>, context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, id: &'static str, timeout: Duration, ) -> Option<WatchPoint>277 fn watch_with_optional( 278 wd: Arc<Self>, 279 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, 280 id: &'static str, 281 timeout: Duration, 282 ) -> Option<WatchPoint> { 283 let Some(deadline) = Instant::now().checked_add(timeout) else { 284 log::warn!("Deadline computation failed for WatchPoint \"{}\"", id); 285 log::warn!("WatchPoint not armed."); 286 return None; 287 }; 288 wd.arm(context, id, deadline); 289 Some(WatchPoint { id, wd, not_send: Default::default() }) 290 } 291 292 /// Create a new watch point. If the WatchPoint is not dropped before the timeout 293 /// expires, a report is logged at least every second, which includes the id string 294 /// and any provided context. watch_with( wd: &Arc<Self>, id: &'static str, timeout: Duration, context: impl std::fmt::Debug + Send + 'static, ) -> Option<WatchPoint>295 pub fn watch_with( 296 wd: &Arc<Self>, 297 id: &'static str, 298 timeout: Duration, 299 context: impl std::fmt::Debug + Send + 'static, 300 ) -> Option<WatchPoint> { 301 Self::watch_with_optional(wd.clone(), Some(Box::new(context)), id, timeout) 302 } 303 304 /// Like `watch_with`, but without context. watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint>305 pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> { 306 Self::watch_with_optional(wd.clone(), None, id, timeout) 307 } 308 arm( &self, context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, id: &'static str, deadline: Instant, )309 fn arm( 310 &self, 311 context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, 312 id: &'static str, 313 deadline: Instant, 314 ) { 315 let tid = thread::current().id(); 316 let index = Index { tid, id }; 317 let record = Record { started: Instant::now(), deadline, context }; 318 319 let (ref condvar, ref state) = *self.state; 320 321 let mut state = state.lock().unwrap(); 322 state.arm(index, record); 323 324 if state.state != State::Running { 325 self.spawn_thread(&mut state); 326 } 327 drop(state); 328 condvar.notify_all(); 329 } 330 disarm(&self, id: &'static str)331 fn disarm(&self, id: &'static str) { 332 let tid = thread::current().id(); 333 let index = Index { tid, id }; 334 let (_, ref state) = *self.state; 335 336 let mut state = state.lock().unwrap(); 337 state.disarm(index); 338 // There is no need to notify condvar. There is no action required for the 339 // watchdog thread before the next deadline. 340 } 341 spawn_thread(&self, state: &mut MutexGuard<WatchdogState>)342 fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) { 343 if let Some(t) = state.thread.take() { 344 t.join().expect("Watchdog thread panicked."); 345 } 346 347 let cloned_state = self.state.clone(); 348 349 state.thread = Some(thread::spawn(move || { 350 let (ref condvar, ref state) = *cloned_state; 351 352 let mut state = state.lock().unwrap(); 353 354 loop { 355 let (has_overdue, next_timeout) = state.overdue_and_next_timeout(); 356 state.log_report(has_overdue); 357 358 let (next_timeout, idle) = match (has_overdue, next_timeout) { 359 (true, Some(next_timeout)) => (min(next_timeout, state.noisy_timeout), false), 360 (true, None) => (state.noisy_timeout, false), 361 (false, Some(next_timeout)) => (next_timeout, false), 362 (false, None) => (state.idle_timeout, true), 363 }; 364 365 // Wait until the closest timeout pops, but use a condition variable so that if a 366 // new watchpoint is started in the meanwhile it will interrupt the wait so we can 367 // recalculate. 368 let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap(); 369 state = s; 370 371 if idle && timeout.timed_out() && state.records.is_empty() { 372 state.reset_noisy_timeout(); 373 state.state = State::NotRunning; 374 break; 375 } 376 } 377 log::info!("Watchdog thread idle -> terminating. Have a great day."); 378 })); 379 state.state = State::Running; 380 } 381 } 382