1 // Copyright 2021, The Android Open Source Project 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Can be removed when instrumentations are added to keystore. 16 #![allow(dead_code)] 17 18 //! This module implements a watchdog thread. 19 20 use std::{ 21 cmp::min, 22 collections::HashMap, 23 sync::Arc, 24 sync::{Condvar, Mutex, MutexGuard}, 25 thread, 26 }; 27 use std::{ 28 marker::PhantomData, 29 time::{Duration, Instant}, 30 }; 31 32 /// Represents a Watchdog record. It can be created with `Watchdog::watch` or 33 /// `Watchdog::watch_with`. It disarms the record when dropped. 34 pub struct WatchPoint { 35 id: &'static str, 36 wd: Arc<Watchdog>, 37 not_send: PhantomData<*mut ()>, // WatchPoint must not be Send. 38 } 39 40 impl Drop for WatchPoint { drop(&mut self)41 fn drop(&mut self) { 42 self.wd.disarm(self.id) 43 } 44 } 45 46 #[derive(Debug, PartialEq, Eq)] 47 enum State { 48 NotRunning, 49 Running, 50 } 51 52 #[derive(Debug, Clone, Hash, PartialEq, Eq)] 53 struct Index { 54 tid: thread::ThreadId, 55 id: &'static str, 56 } 57 58 struct Record { 59 started: Instant, 60 deadline: Instant, 61 callback: Option<Box<dyn Fn() -> String + Send + 'static>>, 62 } 63 64 struct WatchdogState { 65 state: State, 66 thread: Option<thread::JoinHandle<()>>, 67 timeout: Duration, 68 records: HashMap<Index, Record>, 69 last_report: Instant, 70 has_overdue: bool, 71 } 72 73 impl WatchdogState { update_overdue_and_find_next_timeout(&mut self) -> (bool, Option<Duration>)74 fn update_overdue_and_find_next_timeout(&mut self) -> (bool, Option<Duration>) { 75 let now = Instant::now(); 76 let mut next_timeout: Option<Duration> = None; 77 let mut has_overdue = false; 78 for (_, r) in self.records.iter() { 79 let timeout = r.deadline.saturating_duration_since(now); 80 if timeout == Duration::new(0, 0) { 81 has_overdue = true; 82 continue; 83 } 84 next_timeout = match next_timeout { 85 Some(nt) => { 86 if timeout < nt { 87 Some(timeout) 88 } else { 89 Some(nt) 90 } 91 } 92 None => Some(timeout), 93 }; 94 } 95 (has_overdue, next_timeout) 96 } 97 log_report(&mut self, has_overdue: bool) -> bool98 fn log_report(&mut self, has_overdue: bool) -> bool { 99 match (self.has_overdue, has_overdue) { 100 (true, true) => { 101 if self.last_report.elapsed() < Watchdog::NOISY_REPORT_TIMEOUT { 102 self.has_overdue = false; 103 return false; 104 } 105 } 106 (_, false) => { 107 self.has_overdue = false; 108 return false; 109 } 110 (false, true) => {} 111 } 112 self.last_report = Instant::now(); 113 self.has_overdue = has_overdue; 114 log::warn!("### Keystore Watchdog report - BEGIN ###"); 115 116 let now = Instant::now(); 117 let mut overdue_records: Vec<(&Index, &Record)> = self 118 .records 119 .iter() 120 .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0)) 121 .collect(); 122 123 log::warn!("When extracting from a bug report, please include this header"); 124 log::warn!("and all {} records below.", overdue_records.len()); 125 126 // Watch points can be nested, i.e., a single thread may have multiple armed 127 // watch points. And the most recent on each thread (thread recent) is closest to the point 128 // where something is blocked. Furthermore, keystore2 has various critical section 129 // and common backend resources KeyMint that can only be entered serialized. So if one 130 // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch 131 // point is most likely pointing toward the culprit. 132 // Thus, sort by start time first. 133 overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started)); 134 // Then we groups all of the watch points per thread preserving the order within 135 // groups. 136 let groups = overdue_records.iter().fold( 137 HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(), 138 |mut acc, (i, r)| { 139 acc.entry(i.tid).or_default().push((i, r)); 140 acc 141 }, 142 ); 143 // Put the groups back into a vector. 144 let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_values().collect(); 145 // Sort the groups by start time of the most recent (.last()) of each group. 146 // It is panic safe to use unwrap() here because we never add empty vectors to 147 // the map. 148 groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started)); 149 150 for g in groups.iter() { 151 for (i, r) in g.iter() { 152 match &r.callback { 153 Some(cb) => { 154 log::warn!( 155 "{:?} {} Pending: {:?} Overdue {:?}: {}", 156 i.tid, 157 i.id, 158 r.started.elapsed(), 159 r.deadline.elapsed(), 160 (cb)() 161 ); 162 } 163 None => { 164 log::warn!( 165 "{:?} {} Pending: {:?} Overdue {:?}", 166 i.tid, 167 i.id, 168 r.started.elapsed(), 169 r.deadline.elapsed() 170 ); 171 } 172 } 173 } 174 } 175 log::warn!("### Keystore Watchdog report - END ###"); 176 true 177 } 178 disarm(&mut self, index: Index)179 fn disarm(&mut self, index: Index) { 180 self.records.remove(&index); 181 } 182 arm(&mut self, index: Index, record: Record)183 fn arm(&mut self, index: Index, record: Record) { 184 if self.records.insert(index.clone(), record).is_some() { 185 log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index); 186 } 187 } 188 } 189 190 /// Watchdog spawns a thread that logs records of all overdue watch points when a deadline 191 /// is missed and at least every second as long as overdue watch points exist. 192 /// The thread terminates when idle for a given period of time. 193 pub struct Watchdog { 194 state: Arc<(Condvar, Mutex<WatchdogState>)>, 195 } 196 197 impl Watchdog { 198 /// If we have overdue records, we want to be noisy about it and log a report 199 /// at least every `NOISY_REPORT_TIMEOUT` interval. 200 const NOISY_REPORT_TIMEOUT: Duration = Duration::from_secs(1); 201 202 /// Construct a [`Watchdog`]. When `timeout` has elapsed since the watchdog thread became 203 /// idle, i.e., there are no more active or overdue watch points, the watchdog thread 204 /// terminates. new(timeout: Duration) -> Arc<Self>205 pub fn new(timeout: Duration) -> Arc<Self> { 206 Arc::new(Self { 207 state: Arc::new(( 208 Condvar::new(), 209 Mutex::new(WatchdogState { 210 state: State::NotRunning, 211 thread: None, 212 timeout, 213 records: HashMap::new(), 214 last_report: Instant::now(), 215 has_overdue: false, 216 }), 217 )), 218 }) 219 } 220 watch_with_optional( wd: &Arc<Self>, callback: Option<Box<dyn Fn() -> String + Send + 'static>>, id: &'static str, timeout: Duration, ) -> Option<WatchPoint>221 fn watch_with_optional( 222 wd: &Arc<Self>, 223 callback: Option<Box<dyn Fn() -> String + Send + 'static>>, 224 id: &'static str, 225 timeout: Duration, 226 ) -> Option<WatchPoint> { 227 let deadline = Instant::now().checked_add(timeout); 228 if deadline.is_none() { 229 log::warn!("Deadline computation failed for WatchPoint \"{}\"", id); 230 log::warn!("WatchPoint not armed."); 231 return None; 232 } 233 wd.arm(callback, id, deadline.unwrap()); 234 Some(WatchPoint { id, wd: wd.clone(), not_send: Default::default() }) 235 } 236 237 /// Create a new watch point. If the WatchPoint is not dropped before the timeout 238 /// expires, a report is logged at least every second, which includes the id string 239 /// and whatever string the callback returns. watch_with( wd: &Arc<Self>, id: &'static str, timeout: Duration, callback: impl Fn() -> String + Send + 'static, ) -> Option<WatchPoint>240 pub fn watch_with( 241 wd: &Arc<Self>, 242 id: &'static str, 243 timeout: Duration, 244 callback: impl Fn() -> String + Send + 'static, 245 ) -> Option<WatchPoint> { 246 Self::watch_with_optional(wd, Some(Box::new(callback)), id, timeout) 247 } 248 249 /// Like `watch_with`, but without a callback. watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint>250 pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> { 251 Self::watch_with_optional(wd, None, id, timeout) 252 } 253 arm( &self, callback: Option<Box<dyn Fn() -> String + Send + 'static>>, id: &'static str, deadline: Instant, )254 fn arm( 255 &self, 256 callback: Option<Box<dyn Fn() -> String + Send + 'static>>, 257 id: &'static str, 258 deadline: Instant, 259 ) { 260 let tid = thread::current().id(); 261 let index = Index { tid, id }; 262 let record = Record { started: Instant::now(), deadline, callback }; 263 264 let (ref condvar, ref state) = *self.state; 265 266 let mut state = state.lock().unwrap(); 267 state.arm(index, record); 268 269 if state.state != State::Running { 270 self.spawn_thread(&mut state); 271 } 272 drop(state); 273 condvar.notify_all(); 274 } 275 disarm(&self, id: &'static str)276 fn disarm(&self, id: &'static str) { 277 let tid = thread::current().id(); 278 let index = Index { tid, id }; 279 let (_, ref state) = *self.state; 280 281 let mut state = state.lock().unwrap(); 282 state.disarm(index); 283 // There is no need to notify condvar. There is no action required for the 284 // watchdog thread before the next deadline. 285 } 286 spawn_thread(&self, state: &mut MutexGuard<WatchdogState>)287 fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) { 288 if let Some(t) = state.thread.take() { 289 t.join().expect("Watchdog thread panicked."); 290 } 291 292 let cloned_state = self.state.clone(); 293 294 state.thread = Some(thread::spawn(move || { 295 let (ref condvar, ref state) = *cloned_state; 296 297 let mut state = state.lock().unwrap(); 298 299 loop { 300 let (has_overdue, next_timeout) = state.update_overdue_and_find_next_timeout(); 301 state.log_report(has_overdue); 302 let (next_timeout, idle) = match (has_overdue, next_timeout) { 303 (true, Some(next_timeout)) => { 304 (min(next_timeout, Self::NOISY_REPORT_TIMEOUT), false) 305 } 306 (false, Some(next_timeout)) => (next_timeout, false), 307 (true, None) => (Self::NOISY_REPORT_TIMEOUT, false), 308 (false, None) => (state.timeout, true), 309 }; 310 311 let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap(); 312 state = s; 313 314 if idle && timeout.timed_out() && state.records.is_empty() { 315 state.state = State::NotRunning; 316 break; 317 } 318 } 319 log::info!("Watchdog thread idle -> terminating. Have a great day."); 320 })); 321 state.state = State::Running; 322 } 323 } 324 325 #[cfg(test)] 326 mod tests { 327 328 use super::*; 329 use std::sync::atomic; 330 use std::thread; 331 use std::time::Duration; 332 333 #[test] test_watchdog()334 fn test_watchdog() { 335 android_logger::init_once( 336 android_logger::Config::default() 337 .with_tag("keystore2_watchdog_tests") 338 .with_min_level(log::Level::Debug), 339 ); 340 341 let wd = Watchdog::new(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(3).unwrap()); 342 let hit_count = Arc::new(atomic::AtomicU8::new(0)); 343 let hit_count_clone = hit_count.clone(); 344 let wp = 345 Watchdog::watch_with(&wd, "test_watchdog", Duration::from_millis(100), move || { 346 format!("hit_count: {}", hit_count_clone.fetch_add(1, atomic::Ordering::Relaxed)) 347 }); 348 assert_eq!(0, hit_count.load(atomic::Ordering::Relaxed)); 349 thread::sleep(Duration::from_millis(500)); 350 assert_eq!(1, hit_count.load(atomic::Ordering::Relaxed)); 351 thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT); 352 assert_eq!(2, hit_count.load(atomic::Ordering::Relaxed)); 353 drop(wp); 354 thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(4).unwrap()); 355 assert_eq!(2, hit_count.load(atomic::Ordering::Relaxed)); 356 let (_, ref state) = *wd.state; 357 let state = state.lock().unwrap(); 358 assert_eq!(state.state, State::NotRunning); 359 } 360 } 361