• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021, The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Can be removed when instrumentations are added to keystore.
16 #![allow(dead_code)]
17 
18 //! This module implements a watchdog thread.
19 
20 use std::{
21     cmp::min,
22     collections::HashMap,
23     sync::Arc,
24     sync::{Condvar, Mutex, MutexGuard},
25     thread,
26 };
27 use std::{
28     marker::PhantomData,
29     time::{Duration, Instant},
30 };
31 
32 /// Represents a Watchdog record. It can be created with `Watchdog::watch` or
33 /// `Watchdog::watch_with`. It disarms the record when dropped.
34 pub struct WatchPoint {
35     id: &'static str,
36     wd: Arc<Watchdog>,
37     not_send: PhantomData<*mut ()>, // WatchPoint must not be Send.
38 }
39 
40 impl Drop for WatchPoint {
drop(&mut self)41     fn drop(&mut self) {
42         self.wd.disarm(self.id)
43     }
44 }
45 
46 #[derive(Debug, PartialEq, Eq)]
47 enum State {
48     NotRunning,
49     Running,
50 }
51 
52 #[derive(Debug, Clone, Hash, PartialEq, Eq)]
53 struct Index {
54     tid: thread::ThreadId,
55     id: &'static str,
56 }
57 
58 struct Record {
59     started: Instant,
60     deadline: Instant,
61     callback: Option<Box<dyn Fn() -> String + Send + 'static>>,
62 }
63 
64 struct WatchdogState {
65     state: State,
66     thread: Option<thread::JoinHandle<()>>,
67     timeout: Duration,
68     records: HashMap<Index, Record>,
69     last_report: Instant,
70     has_overdue: bool,
71 }
72 
73 impl WatchdogState {
update_overdue_and_find_next_timeout(&mut self) -> (bool, Option<Duration>)74     fn update_overdue_and_find_next_timeout(&mut self) -> (bool, Option<Duration>) {
75         let now = Instant::now();
76         let mut next_timeout: Option<Duration> = None;
77         let mut has_overdue = false;
78         for (_, r) in self.records.iter() {
79             let timeout = r.deadline.saturating_duration_since(now);
80             if timeout == Duration::new(0, 0) {
81                 has_overdue = true;
82                 continue;
83             }
84             next_timeout = match next_timeout {
85                 Some(nt) => {
86                     if timeout < nt {
87                         Some(timeout)
88                     } else {
89                         Some(nt)
90                     }
91                 }
92                 None => Some(timeout),
93             };
94         }
95         (has_overdue, next_timeout)
96     }
97 
log_report(&mut self, has_overdue: bool) -> bool98     fn log_report(&mut self, has_overdue: bool) -> bool {
99         match (self.has_overdue, has_overdue) {
100             (true, true) => {
101                 if self.last_report.elapsed() < Watchdog::NOISY_REPORT_TIMEOUT {
102                     self.has_overdue = false;
103                     return false;
104                 }
105             }
106             (_, false) => {
107                 self.has_overdue = false;
108                 return false;
109             }
110             (false, true) => {}
111         }
112         self.last_report = Instant::now();
113         self.has_overdue = has_overdue;
114         log::warn!("### Keystore Watchdog report - BEGIN ###");
115 
116         let now = Instant::now();
117         let mut overdue_records: Vec<(&Index, &Record)> = self
118             .records
119             .iter()
120             .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0))
121             .collect();
122 
123         log::warn!("When extracting from a bug report, please include this header");
124         log::warn!("and all {} records below.", overdue_records.len());
125 
126         // Watch points can be nested, i.e., a single thread may have multiple armed
127         // watch points. And the most recent on each thread (thread recent) is closest to the point
128         // where something is blocked. Furthermore, keystore2 has various critical section
129         // and common backend resources KeyMint that can only be entered serialized. So if one
130         // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch
131         // point is most likely pointing toward the culprit.
132         // Thus, sort by start time first.
133         overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started));
134         // Then we groups all of the watch points per thread preserving the order within
135         // groups.
136         let groups = overdue_records.iter().fold(
137             HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(),
138             |mut acc, (i, r)| {
139                 acc.entry(i.tid).or_default().push((i, r));
140                 acc
141             },
142         );
143         // Put the groups back into a vector.
144         let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_values().collect();
145         // Sort the groups by start time of the most recent (.last()) of each group.
146         // It is panic safe to use unwrap() here because we never add empty vectors to
147         // the map.
148         groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started));
149 
150         for g in groups.iter() {
151             for (i, r) in g.iter() {
152                 match &r.callback {
153                     Some(cb) => {
154                         log::warn!(
155                             "{:?} {} Pending: {:?} Overdue {:?}: {}",
156                             i.tid,
157                             i.id,
158                             r.started.elapsed(),
159                             r.deadline.elapsed(),
160                             (cb)()
161                         );
162                     }
163                     None => {
164                         log::warn!(
165                             "{:?} {} Pending: {:?} Overdue {:?}",
166                             i.tid,
167                             i.id,
168                             r.started.elapsed(),
169                             r.deadline.elapsed()
170                         );
171                     }
172                 }
173             }
174         }
175         log::warn!("### Keystore Watchdog report - END ###");
176         true
177     }
178 
disarm(&mut self, index: Index)179     fn disarm(&mut self, index: Index) {
180         self.records.remove(&index);
181     }
182 
arm(&mut self, index: Index, record: Record)183     fn arm(&mut self, index: Index, record: Record) {
184         if self.records.insert(index.clone(), record).is_some() {
185             log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index);
186         }
187     }
188 }
189 
190 /// Watchdog spawns a thread that logs records of all overdue watch points when a deadline
191 /// is missed and at least every second as long as overdue watch points exist.
192 /// The thread terminates when idle for a given period of time.
193 pub struct Watchdog {
194     state: Arc<(Condvar, Mutex<WatchdogState>)>,
195 }
196 
197 impl Watchdog {
198     /// If we have overdue records, we want to be noisy about it and log a report
199     /// at least every `NOISY_REPORT_TIMEOUT` interval.
200     const NOISY_REPORT_TIMEOUT: Duration = Duration::from_secs(1);
201 
202     /// Construct a [`Watchdog`]. When `timeout` has elapsed since the watchdog thread became
203     /// idle, i.e., there are no more active or overdue watch points, the watchdog thread
204     /// terminates.
new(timeout: Duration) -> Arc<Self>205     pub fn new(timeout: Duration) -> Arc<Self> {
206         Arc::new(Self {
207             state: Arc::new((
208                 Condvar::new(),
209                 Mutex::new(WatchdogState {
210                     state: State::NotRunning,
211                     thread: None,
212                     timeout,
213                     records: HashMap::new(),
214                     last_report: Instant::now(),
215                     has_overdue: false,
216                 }),
217             )),
218         })
219     }
220 
watch_with_optional( wd: &Arc<Self>, callback: Option<Box<dyn Fn() -> String + Send + 'static>>, id: &'static str, timeout: Duration, ) -> Option<WatchPoint>221     fn watch_with_optional(
222         wd: &Arc<Self>,
223         callback: Option<Box<dyn Fn() -> String + Send + 'static>>,
224         id: &'static str,
225         timeout: Duration,
226     ) -> Option<WatchPoint> {
227         let deadline = Instant::now().checked_add(timeout);
228         if deadline.is_none() {
229             log::warn!("Deadline computation failed for WatchPoint \"{}\"", id);
230             log::warn!("WatchPoint not armed.");
231             return None;
232         }
233         wd.arm(callback, id, deadline.unwrap());
234         Some(WatchPoint { id, wd: wd.clone(), not_send: Default::default() })
235     }
236 
237     /// Create a new watch point. If the WatchPoint is not dropped before the timeout
238     /// expires, a report is logged at least every second, which includes the id string
239     /// and whatever string the callback returns.
watch_with( wd: &Arc<Self>, id: &'static str, timeout: Duration, callback: impl Fn() -> String + Send + 'static, ) -> Option<WatchPoint>240     pub fn watch_with(
241         wd: &Arc<Self>,
242         id: &'static str,
243         timeout: Duration,
244         callback: impl Fn() -> String + Send + 'static,
245     ) -> Option<WatchPoint> {
246         Self::watch_with_optional(wd, Some(Box::new(callback)), id, timeout)
247     }
248 
249     /// Like `watch_with`, but without a callback.
watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint>250     pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> {
251         Self::watch_with_optional(wd, None, id, timeout)
252     }
253 
arm( &self, callback: Option<Box<dyn Fn() -> String + Send + 'static>>, id: &'static str, deadline: Instant, )254     fn arm(
255         &self,
256         callback: Option<Box<dyn Fn() -> String + Send + 'static>>,
257         id: &'static str,
258         deadline: Instant,
259     ) {
260         let tid = thread::current().id();
261         let index = Index { tid, id };
262         let record = Record { started: Instant::now(), deadline, callback };
263 
264         let (ref condvar, ref state) = *self.state;
265 
266         let mut state = state.lock().unwrap();
267         state.arm(index, record);
268 
269         if state.state != State::Running {
270             self.spawn_thread(&mut state);
271         }
272         drop(state);
273         condvar.notify_all();
274     }
275 
disarm(&self, id: &'static str)276     fn disarm(&self, id: &'static str) {
277         let tid = thread::current().id();
278         let index = Index { tid, id };
279         let (_, ref state) = *self.state;
280 
281         let mut state = state.lock().unwrap();
282         state.disarm(index);
283         // There is no need to notify condvar. There is no action required for the
284         // watchdog thread before the next deadline.
285     }
286 
spawn_thread(&self, state: &mut MutexGuard<WatchdogState>)287     fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) {
288         if let Some(t) = state.thread.take() {
289             t.join().expect("Watchdog thread panicked.");
290         }
291 
292         let cloned_state = self.state.clone();
293 
294         state.thread = Some(thread::spawn(move || {
295             let (ref condvar, ref state) = *cloned_state;
296 
297             let mut state = state.lock().unwrap();
298 
299             loop {
300                 let (has_overdue, next_timeout) = state.update_overdue_and_find_next_timeout();
301                 state.log_report(has_overdue);
302                 let (next_timeout, idle) = match (has_overdue, next_timeout) {
303                     (true, Some(next_timeout)) => {
304                         (min(next_timeout, Self::NOISY_REPORT_TIMEOUT), false)
305                     }
306                     (false, Some(next_timeout)) => (next_timeout, false),
307                     (true, None) => (Self::NOISY_REPORT_TIMEOUT, false),
308                     (false, None) => (state.timeout, true),
309                 };
310 
311                 let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap();
312                 state = s;
313 
314                 if idle && timeout.timed_out() && state.records.is_empty() {
315                     state.state = State::NotRunning;
316                     break;
317                 }
318             }
319             log::info!("Watchdog thread idle -> terminating. Have a great day.");
320         }));
321         state.state = State::Running;
322     }
323 }
324 
325 #[cfg(test)]
326 mod tests {
327 
328     use super::*;
329     use std::sync::atomic;
330     use std::thread;
331     use std::time::Duration;
332 
333     #[test]
test_watchdog()334     fn test_watchdog() {
335         android_logger::init_once(
336             android_logger::Config::default()
337                 .with_tag("keystore2_watchdog_tests")
338                 .with_min_level(log::Level::Debug),
339         );
340 
341         let wd = Watchdog::new(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(3).unwrap());
342         let hit_count = Arc::new(atomic::AtomicU8::new(0));
343         let hit_count_clone = hit_count.clone();
344         let wp =
345             Watchdog::watch_with(&wd, "test_watchdog", Duration::from_millis(100), move || {
346                 format!("hit_count: {}", hit_count_clone.fetch_add(1, atomic::Ordering::Relaxed))
347             });
348         assert_eq!(0, hit_count.load(atomic::Ordering::Relaxed));
349         thread::sleep(Duration::from_millis(500));
350         assert_eq!(1, hit_count.load(atomic::Ordering::Relaxed));
351         thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT);
352         assert_eq!(2, hit_count.load(atomic::Ordering::Relaxed));
353         drop(wp);
354         thread::sleep(Watchdog::NOISY_REPORT_TIMEOUT.checked_mul(4).unwrap());
355         assert_eq!(2, hit_count.load(atomic::Ordering::Relaxed));
356         let (_, ref state) = *wd.state;
357         let state = state.lock().unwrap();
358         assert_eq!(state.state, State::NotRunning);
359     }
360 }
361