• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021, The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Can be removed when instrumentations are added to keystore.
16 #![allow(dead_code)]
17 
18 //! This module implements a watchdog thread.
19 
20 use std::{
21     cmp::min,
22     collections::HashMap,
23     sync::Arc,
24     sync::{Condvar, Mutex, MutexGuard},
25     thread,
26 };
27 use std::{
28     marker::PhantomData,
29     time::{Duration, Instant},
30 };
31 
32 #[cfg(test)]
33 mod tests;
34 
35 /// Represents a Watchdog record. It can be created with `Watchdog::watch` or
36 /// `Watchdog::watch_with`. It disarms the record when dropped.
37 pub struct WatchPoint {
38     id: &'static str,
39     wd: Arc<Watchdog>,
40     not_send: PhantomData<*mut ()>, // WatchPoint must not be Send.
41 }
42 
43 impl Drop for WatchPoint {
drop(&mut self)44     fn drop(&mut self) {
45         self.wd.disarm(self.id)
46     }
47 }
48 
49 #[derive(Debug, PartialEq, Eq)]
50 enum State {
51     NotRunning,
52     Running,
53 }
54 
55 #[derive(Debug, Clone, Hash, PartialEq, Eq)]
56 struct Index {
57     tid: thread::ThreadId,
58     id: &'static str,
59 }
60 
61 struct Record {
62     started: Instant,
63     deadline: Instant,
64     context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
65 }
66 
67 impl Record {
68     // Return a string representation of the start time of the record.
69     //
70     // Times are hard. This may not be accurate (e.g. if the system clock has been modified since
71     // the watchdog started), but it's _really_ useful to get a starting wall time for overrunning
72     // watchdogs.
started_utc(&self) -> String73     fn started_utc(&self) -> String {
74         let started_utc = chrono::Utc::now() - self.started.elapsed();
75         format!("{}", started_utc.format("%m-%d %H:%M:%S%.3f UTC"))
76     }
77 }
78 
79 struct WatchdogState {
80     state: State,
81     thread: Option<thread::JoinHandle<()>>,
82     /// How long to wait before dropping the watchdog thread when idle.
83     idle_timeout: Duration,
84     records: HashMap<Index, Record>,
85     last_report: Option<Instant>,
86     noisy_timeout: Duration,
87 }
88 
89 impl WatchdogState {
90     /// If we have overdue records, we want to log them but slowly backoff
91     /// so that we do not clog the logs. We start with logs every
92     /// `MIN_REPORT_TIMEOUT` sec then increment the timeout by 5 up
93     /// to a maximum of `MAX_REPORT_TIMEOUT`.
94     const MIN_REPORT_TIMEOUT: Duration = Duration::from_secs(1);
95     const MAX_REPORT_TIMEOUT: Duration = Duration::from_secs(30);
96 
reset_noisy_timeout(&mut self)97     fn reset_noisy_timeout(&mut self) {
98         self.noisy_timeout = Self::MIN_REPORT_TIMEOUT;
99     }
100 
update_noisy_timeout(&mut self)101     fn update_noisy_timeout(&mut self) {
102         let noisy_update = self.noisy_timeout + Duration::from_secs(5);
103         self.noisy_timeout = min(Self::MAX_REPORT_TIMEOUT, noisy_update);
104     }
105 
overdue_and_next_timeout(&self) -> (bool, Option<Duration>)106     fn overdue_and_next_timeout(&self) -> (bool, Option<Duration>) {
107         let now = Instant::now();
108         let mut next_timeout: Option<Duration> = None;
109         let mut has_overdue = false;
110         for (_, r) in self.records.iter() {
111             let timeout = r.deadline.saturating_duration_since(now);
112             if timeout == Duration::new(0, 0) {
113                 // This timeout has passed.
114                 has_overdue = true;
115             } else {
116                 // This timeout is still to come; see if it's the closest one to now.
117                 next_timeout = match next_timeout {
118                     Some(nt) if timeout < nt => Some(timeout),
119                     Some(nt) => Some(nt),
120                     None => Some(timeout),
121                 };
122             }
123         }
124         (has_overdue, next_timeout)
125     }
126 
log_report(&mut self, has_overdue: bool)127     fn log_report(&mut self, has_overdue: bool) {
128         if !has_overdue {
129             // Nothing to report.
130             self.last_report = None;
131             return;
132         }
133         // Something to report...
134         if let Some(reported_at) = self.last_report {
135             if reported_at.elapsed() < self.noisy_timeout {
136                 // .. but it's too soon since the last report.
137                 self.last_report = None;
138                 return;
139             }
140         }
141         self.update_noisy_timeout();
142         self.last_report = Some(Instant::now());
143         log::warn!("### Keystore Watchdog report - BEGIN ###");
144 
145         let now = Instant::now();
146         let mut overdue_records: Vec<(&Index, &Record)> = self
147             .records
148             .iter()
149             .filter(|(_, r)| r.deadline.saturating_duration_since(now) == Duration::new(0, 0))
150             .collect();
151 
152         log::warn!(
153             concat!(
154                 "When extracting from a bug report, please include this header ",
155                 "and all {} records below (to footer)"
156             ),
157             overdue_records.len()
158         );
159 
160         // Watch points can be nested, i.e., a single thread may have multiple armed
161         // watch points. And the most recent on each thread (thread recent) is closest to the point
162         // where something is blocked. Furthermore, keystore2 has various critical section
163         // and common backend resources KeyMint that can only be entered serialized. So if one
164         // thread hangs, the others will soon follow suite. Thus the oldest "thread recent" watch
165         // point is most likely pointing toward the culprit.
166         // Thus, sort by start time first.
167         overdue_records.sort_unstable_by(|(_, r1), (_, r2)| r1.started.cmp(&r2.started));
168         // Then we groups all of the watch points per thread preserving the order within
169         // groups.
170         let groups = overdue_records.iter().fold(
171             HashMap::<thread::ThreadId, Vec<(&Index, &Record)>>::new(),
172             |mut acc, (i, r)| {
173                 acc.entry(i.tid).or_default().push((i, r));
174                 acc
175             },
176         );
177         // Put the groups back into a vector.
178         let mut groups: Vec<Vec<(&Index, &Record)>> = groups.into_values().collect();
179         // Sort the groups by start time of the most recent (.last()) of each group.
180         // It is panic safe to use unwrap() here because we never add empty vectors to
181         // the map.
182         groups.sort_by(|v1, v2| v1.last().unwrap().1.started.cmp(&v2.last().unwrap().1.started));
183 
184         for g in groups.iter() {
185             for (i, r) in g.iter() {
186                 match &r.context {
187                     Some(ctx) => {
188                         log::warn!(
189                             "{:?} {} Started: {} Pending: {:?} Overdue {:?} for {:?}",
190                             i.tid,
191                             i.id,
192                             r.started_utc(),
193                             r.started.elapsed(),
194                             r.deadline.elapsed(),
195                             ctx
196                         );
197                     }
198                     None => {
199                         log::warn!(
200                             "{:?} {} Started: {} Pending: {:?} Overdue {:?}",
201                             i.tid,
202                             i.id,
203                             r.started_utc(),
204                             r.started.elapsed(),
205                             r.deadline.elapsed()
206                         );
207                     }
208                 }
209             }
210         }
211         log::warn!("### Keystore Watchdog report - END ###");
212     }
213 
disarm(&mut self, index: Index)214     fn disarm(&mut self, index: Index) {
215         let result = self.records.remove(&index);
216         if let Some(record) = result {
217             let now = Instant::now();
218             let timeout_left = record.deadline.saturating_duration_since(now);
219             if timeout_left == Duration::new(0, 0) {
220                 match &record.context {
221                     Some(ctx) => log::info!(
222                         "Watchdog complete for: {:?} {} Started: {} Pending: {:?} Overdue {:?} for {:?}",
223                         index.tid,
224                         index.id,
225                         record.started_utc(),
226                         record.started.elapsed(),
227                         record.deadline.elapsed(),
228                         ctx
229                     ),
230                     None => log::info!(
231                         "Watchdog complete for: {:?} {} Started: {} Pending: {:?} Overdue {:?}",
232                         index.tid,
233                         index.id,
234                         record.started_utc(),
235                         record.started.elapsed(),
236                         record.deadline.elapsed()
237                     ),
238                 }
239             }
240         }
241     }
242 
arm(&mut self, index: Index, record: Record)243     fn arm(&mut self, index: Index, record: Record) {
244         if self.records.insert(index.clone(), record).is_some() {
245             log::warn!("Recursive watchdog record at \"{:?}\" replaces previous record.", index);
246         }
247     }
248 }
249 
250 /// Watchdog spawns a thread that logs records of all overdue watch points when a deadline
251 /// is missed and at least every second as long as overdue watch points exist.
252 /// The thread terminates when idle for a given period of time.
253 pub struct Watchdog {
254     state: Arc<(Condvar, Mutex<WatchdogState>)>,
255 }
256 
257 impl Watchdog {
258     /// Construct a [`Watchdog`]. When `idle_timeout` has elapsed since the watchdog thread became
259     /// idle, i.e., there are no more active or overdue watch points, the watchdog thread
260     /// terminates.
new(idle_timeout: Duration) -> Arc<Self>261     pub fn new(idle_timeout: Duration) -> Arc<Self> {
262         Arc::new(Self {
263             state: Arc::new((
264                 Condvar::new(),
265                 Mutex::new(WatchdogState {
266                     state: State::NotRunning,
267                     thread: None,
268                     idle_timeout,
269                     records: HashMap::new(),
270                     last_report: None,
271                     noisy_timeout: WatchdogState::MIN_REPORT_TIMEOUT,
272                 }),
273             )),
274         })
275     }
276 
watch_with_optional( wd: Arc<Self>, context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, id: &'static str, timeout: Duration, ) -> Option<WatchPoint>277     fn watch_with_optional(
278         wd: Arc<Self>,
279         context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
280         id: &'static str,
281         timeout: Duration,
282     ) -> Option<WatchPoint> {
283         let Some(deadline) = Instant::now().checked_add(timeout) else {
284             log::warn!("Deadline computation failed for WatchPoint \"{}\"", id);
285             log::warn!("WatchPoint not armed.");
286             return None;
287         };
288         wd.arm(context, id, deadline);
289         Some(WatchPoint { id, wd, not_send: Default::default() })
290     }
291 
292     /// Create a new watch point. If the WatchPoint is not dropped before the timeout
293     /// expires, a report is logged at least every second, which includes the id string
294     /// and any provided context.
watch_with( wd: &Arc<Self>, id: &'static str, timeout: Duration, context: impl std::fmt::Debug + Send + 'static, ) -> Option<WatchPoint>295     pub fn watch_with(
296         wd: &Arc<Self>,
297         id: &'static str,
298         timeout: Duration,
299         context: impl std::fmt::Debug + Send + 'static,
300     ) -> Option<WatchPoint> {
301         Self::watch_with_optional(wd.clone(), Some(Box::new(context)), id, timeout)
302     }
303 
304     /// Like `watch_with`, but without context.
watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint>305     pub fn watch(wd: &Arc<Self>, id: &'static str, timeout: Duration) -> Option<WatchPoint> {
306         Self::watch_with_optional(wd.clone(), None, id, timeout)
307     }
308 
arm( &self, context: Option<Box<dyn std::fmt::Debug + Send + 'static>>, id: &'static str, deadline: Instant, )309     fn arm(
310         &self,
311         context: Option<Box<dyn std::fmt::Debug + Send + 'static>>,
312         id: &'static str,
313         deadline: Instant,
314     ) {
315         let tid = thread::current().id();
316         let index = Index { tid, id };
317         let record = Record { started: Instant::now(), deadline, context };
318 
319         let (ref condvar, ref state) = *self.state;
320 
321         let mut state = state.lock().unwrap();
322         state.arm(index, record);
323 
324         if state.state != State::Running {
325             self.spawn_thread(&mut state);
326         }
327         drop(state);
328         condvar.notify_all();
329     }
330 
disarm(&self, id: &'static str)331     fn disarm(&self, id: &'static str) {
332         let tid = thread::current().id();
333         let index = Index { tid, id };
334         let (_, ref state) = *self.state;
335 
336         let mut state = state.lock().unwrap();
337         state.disarm(index);
338         // There is no need to notify condvar. There is no action required for the
339         // watchdog thread before the next deadline.
340     }
341 
spawn_thread(&self, state: &mut MutexGuard<WatchdogState>)342     fn spawn_thread(&self, state: &mut MutexGuard<WatchdogState>) {
343         if let Some(t) = state.thread.take() {
344             t.join().expect("Watchdog thread panicked.");
345         }
346 
347         let cloned_state = self.state.clone();
348 
349         state.thread = Some(thread::spawn(move || {
350             let (ref condvar, ref state) = *cloned_state;
351 
352             let mut state = state.lock().unwrap();
353 
354             loop {
355                 let (has_overdue, next_timeout) = state.overdue_and_next_timeout();
356                 state.log_report(has_overdue);
357 
358                 let (next_timeout, idle) = match (has_overdue, next_timeout) {
359                     (true, Some(next_timeout)) => (min(next_timeout, state.noisy_timeout), false),
360                     (true, None) => (state.noisy_timeout, false),
361                     (false, Some(next_timeout)) => (next_timeout, false),
362                     (false, None) => (state.idle_timeout, true),
363                 };
364 
365                 // Wait until the closest timeout pops, but use a condition variable so that if a
366                 // new watchpoint is started in the meanwhile it will interrupt the wait so we can
367                 // recalculate.
368                 let (s, timeout) = condvar.wait_timeout(state, next_timeout).unwrap();
369                 state = s;
370 
371                 if idle && timeout.timed_out() && state.records.is_empty() {
372                     state.reset_noisy_timeout();
373                     state.state = State::NotRunning;
374                     break;
375                 }
376             }
377             log::info!("Watchdog thread idle -> terminating. Have a great day.");
378         }));
379         state.state = State::Running;
380     }
381 }
382