// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! vmwdt is a virtual watchdog memory mapped device which detects stalls
//! on the vCPUs and resets the guest when no 'pet' events are received.
//! <https://docs.google.com/document/d/1DYmk2roxlwHZsOfcJi8xDMdWOHAmomvs2SDh7KPud3Y/edit?usp=sharing&resourcekey=0-oSNabc-t040a1q0K4cyI8Q>
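//!
//! Each vCPU gets its own VMWDT_REG_LEN-sized register window: STATUS (0x00)
//! enables or disables the per-vCPU watchdog, LOAD_CNT (0x04) arms the
//! expiration counter and doubles as the 'pet' write, CURRENT_CNT (0x08) is
//! read-only, and CLOCK_FREQ_HZ (0x0C) sets the counter frequency. (This is
//! a summary of the register handling in `BusDevice::write` below.)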

use std::convert::TryFrom;
use std::fs;
use std::io::Error as IoError;
use std::process;
use std::sync::Arc;
use std::time::Duration;

use base::debug;
use base::error;
use base::gettid;
use base::warn;
use base::AsRawDescriptor;
use base::Descriptor;
use base::Error as SysError;
use base::Event;
use base::EventToken;
use base::SendTube;
use base::Timer;
use base::TimerTrait;
use base::VmEventType;
use base::WaitContext;
use base::WorkerThread;
use remain::sorted;
use sync::Mutex;
use thiserror::Error;

use crate::pci::CrosvmDeviceId;
use crate::BusAccessInfo;
use crate::BusDevice;
use crate::DeviceId;
use crate::Suspendable;

// Register offsets
const VMWDT_REG_STATUS: u32 = 0x00;
const VMWDT_REG_LOAD_CNT: u32 = 0x04;
const VMWDT_REG_CURRENT_CNT: u32 = 0x08;
const VMWDT_REG_CLOCK_FREQ_HZ: u32 = 0x0C;

// Length of the per-vCPU register window
const VMWDT_REG_LEN: u64 = 0x10;

pub const VMWDT_DEFAULT_TIMEOUT_SEC: u32 = 10;
pub const VMWDT_DEFAULT_CLOCK_HZ: u32 = 2;
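// At the default 2Hz clock, the default 10s timeout corresponds to a
// LOAD_CNT of 20 ticks.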

// 0-based index of the guest_time field in /proc/<pid>/task/<tid>/stat
const PROCSTAT_GUEST_TIME_INDX: usize = 42;

#[sorted]
#[derive(Error, Debug)]
pub enum VmwdtError {
    /// Error while creating event.
    #[error("failed to create event: {0}")]
    CreateEvent(SysError),
    /// Error while trying to create worker thread.
    #[error("failed to spawn thread: {0}")]
    SpawnThread(IoError),
    /// Error while trying to create timer.
    #[error("failed to create vmwdt counter due to timer fd: {0}")]
    TimerCreateError(SysError),
    /// Error while waiting for events.
    #[error("failed to wait for events: {0}")]
    WaitError(SysError),
}

type VmwdtResult<T> = std::result::Result<T, VmwdtError>;

pub struct VmwdtPerCpu {
    // Flag which indicates whether the watchdog is started
    is_enabled: bool,
    // Timer used to generate periodic events at `timer_freq_hz` frequency
    timer: Timer,
    // The frequency of the `timer`
    timer_freq_hz: u64,
    // Timestamp, in milliseconds, of the last guest activity
    last_guest_time_ms: i64,
    // The thread id of the vCPU thread, as reported by gettid()
    pid: u32,
    // The process id of the crosvm process this vCPU thread belongs to
    ppid: u32,
    // The pre-programmed one-shot expiration interval. If the guest runs in this
    // interval but we don't receive a periodic event, the guest is stalled.
    next_expiration_interval_ms: i64,
}

pub struct Vmwdt {
    vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
    // The worker thread that waits on the timer fd
    worker_thread: Option<WorkerThread<()>>,
    // TODO: @sebastianene add separate reset event for the watchdog
    // Reset source if the device is not responding
    reset_evt_wrtube: SendTube,
    activated: bool,
}

impl Vmwdt {
    pub fn new(cpu_count: usize, reset_evt_wrtube: SendTube) -> VmwdtResult<Vmwdt> {
        let mut vec = Vec::new();
        for _ in 0..cpu_count {
            vec.push(VmwdtPerCpu {
                last_guest_time_ms: 0,
                pid: 0,
                ppid: 0,
                is_enabled: false,
                timer: Timer::new().map_err(VmwdtError::TimerCreateError)?,
                timer_freq_hz: 0,
                next_expiration_interval_ms: 0,
            });
        }
        let vm_wdts = Arc::new(Mutex::new(vec));

        Ok(Vmwdt {
            vm_wdts,
            worker_thread: None,
            reset_evt_wrtube,
            activated: false,
        })
    }

    pub fn vmwdt_worker_thread(
        vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>,
        kill_evt: Event,
        reset_evt_wrtube: SendTube,
    ) {
        #[derive(EventToken)]
        enum Token {
            Kill,
            Timer(usize),
        }

        let wait_ctx: WaitContext<Token> = WaitContext::new().unwrap();
        wait_ctx.add(&kill_evt, Token::Kill).unwrap();

        let len = vm_wdts.lock().len();
        for clock_id in 0..len {
            let timer_fd = vm_wdts.lock()[clock_id].timer.as_raw_descriptor();
            wait_ctx
                .add(&Descriptor(timer_fd), Token::Timer(clock_id))
                .unwrap();
        }

        loop {
            let events = wait_ctx.wait().unwrap();
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => {
                        return;
                    }
                    Token::Timer(cpu_id) => {
                        let mut wdts_locked = vm_wdts.lock();
                        let watchdog = &mut wdts_locked[cpu_id];
                        if let Err(_e) = watchdog.timer.wait() {
                            error!("error waiting for timer event on vcpu {}", cpu_id);
                        }

                        let current_guest_time_ms_result =
                            Vmwdt::get_guest_time_ms(watchdog.ppid, watchdog.pid);
                        let current_guest_time_ms = match current_guest_time_ms_result {
                            Ok(value) => value,
                            Err(_e) => return,
                        };
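                        // Stall detection: subtract the guest CPU time accrued
                        // since the last pet from the programmed interval. A
                        // positive remainder means the guest has not yet run
                        // for the full interval, so re-arm the timer for the
                        // remainder; otherwise the guest ran through the whole
                        // interval without petting the watchdog and is stalled.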
                        let remaining_time_ms = watchdog.next_expiration_interval_ms
                            - (current_guest_time_ms - watchdog.last_guest_time_ms);

                        if remaining_time_ms > 0 {
                            watchdog.next_expiration_interval_ms = remaining_time_ms;
                            if let Err(_e) = watchdog
                                .timer
                                .reset(Duration::from_millis(remaining_time_ms as u64), None)
                            {
                                error!("failed to reset internal timer on vcpu {}", cpu_id);
                            }
                        } else {
                            // The guest ran but it did not send the periodic event
                            if let Err(_e) =
                                reset_evt_wrtube.send::<VmEventType>(&VmEventType::WatchdogReset)
                            {
                                error!("failed to send reset event from vcpu {}", cpu_id)
                            }
                        }
                    }
                }
            }
        }
    }

    fn start(&mut self) {
        let vm_wdts = self.vm_wdts.clone();
        let reset_evt_wrtube = self.reset_evt_wrtube.try_clone().unwrap();

        self.activated = true;
        self.worker_thread = Some(WorkerThread::start("vmwdt worker", |kill_evt| {
            Vmwdt::vmwdt_worker_thread(vm_wdts, kill_evt, reset_evt_wrtube)
        }));
    }

    fn ensure_started(&mut self) {
        if self.worker_thread.is_some() {
            return;
        }

        self.start();
    }

    #[cfg(any(target_os = "linux", target_os = "android"))]
    pub fn get_guest_time_ms(ppid: u32, pid: u32) -> Result<i64, SysError> {
        // TODO: @sebastianene check if we can avoid open-read-close on each call
        let stat_path = format!("/proc/{}/task/{}/stat", ppid, pid);
        let contents = fs::read_to_string(stat_path)?;

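        // Per proc(5), guest_time (time spent running a virtual CPU for this
        // task, measured in clock ticks) is field 43, i.e. 0-based index 42
        // after splitting on whitespace.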
        let gtime_ticks = contents
            .split_whitespace()
            .nth(PROCSTAT_GUEST_TIME_INDX)
            .and_then(|guest_time| guest_time.parse::<u64>().ok())
            .unwrap_or(0);

        // SAFETY:
        // Safe because this just returns an integer
        let ticks_per_sec = unsafe { libc::sysconf(libc::_SC_CLK_TCK) } as u64;
        Ok((gtime_ticks * 1000 / ticks_per_sec) as i64)
    }

    #[cfg(not(any(target_os = "linux", target_os = "android")))]
    pub fn get_guest_time_ms(_ppid: u32, _pid: u32) -> Result<i64, SysError> {
        Ok(0)
    }
}

impl BusDevice for Vmwdt {
    fn debug_label(&self) -> String {
        "Vmwdt".to_owned()
    }

    fn device_id(&self) -> DeviceId {
        CrosvmDeviceId::VmWatchdog.into()
    }

    fn read(&mut self, _offset: BusAccessInfo, _data: &mut [u8]) {}

    fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
        let data_array = match <&[u8; 4]>::try_from(data) {
            Ok(array) => array,
            _ => {
                error!("Bad write size: {} for vmwdt", data.len());
                return;
            }
        };

        let reg_val = u32::from_ne_bytes(*data_array);
        let cpu_index: usize = (info.offset / VMWDT_REG_LEN) as usize;
        let reg_offset = (info.offset % VMWDT_REG_LEN) as u32;

        // An index equal to the vector length is already out of bounds, so
        // reject it as well.
        if cpu_index >= self.vm_wdts.lock().len() {
            error!("Bad write cpu_index {}", cpu_index);
            return;
        }

        match reg_offset {
            VMWDT_REG_STATUS => {
                self.ensure_started();
                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];

                cpu_watchdog.is_enabled = reg_val != 0;

                if reg_val != 0 {
                    let due = Duration::from_nanos(1);
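                    // The guest is expected to program a non-zero
                    // VMWDT_REG_CLOCK_FREQ_HZ before enabling the watchdog;
                    // the division below would panic on a zero frequency.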
                    let interval = Duration::from_millis(1000 / cpu_watchdog.timer_freq_hz);
                    cpu_watchdog.timer.reset(due, Some(interval)).unwrap();
                } else {
                    cpu_watchdog.timer.clear().unwrap();
                }
            }
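            // Writing LOAD_CNT records the guest time at the moment of the
            // write and arms the one-shot expiration; periodic rewrites of
            // this register are the 'pet' events named in the module docs.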
            VMWDT_REG_LOAD_CNT => {
                let ppid = process::id();
                let pid = gettid();
                let guest_time_ms_result = Vmwdt::get_guest_time_ms(ppid, pid as u32);
                let guest_time_ms = match guest_time_ms_result {
                    Ok(time) => time,
                    Err(_e) => return,
                };

                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];
                let next_expiration_interval_ms =
                    reg_val as u64 * 1000 / cpu_watchdog.timer_freq_hz;

                cpu_watchdog.pid = pid as u32;
                cpu_watchdog.ppid = ppid;
                cpu_watchdog.last_guest_time_ms = guest_time_ms;
                cpu_watchdog.next_expiration_interval_ms = next_expiration_interval_ms as i64;

                if cpu_watchdog.is_enabled {
                    if let Err(_e) = cpu_watchdog
                        .timer
                        .reset(Duration::from_millis(next_expiration_interval_ms), None)
                    {
                        error!("failed to reset one-shot timer for vcpu {}", cpu_index);
                    }
                }
            }
            VMWDT_REG_CURRENT_CNT => {
                warn!("invalid write to read-only VMWDT_REG_CURRENT_CNT register");
            }
            VMWDT_REG_CLOCK_FREQ_HZ => {
                let mut wdts_locked = self.vm_wdts.lock();
                let cpu_watchdog = &mut wdts_locked[cpu_index];

                debug!(
                    "CPU:{:x} wrote VMWDT_REG_CLOCK_FREQ_HZ {:x}",
                    cpu_index, reg_val
                );
                cpu_watchdog.timer_freq_hz = reg_val as u64;
            }
            // Unaligned offsets within the window land here; log instead of
            // panicking on guest-controlled input.
            _ => error!("invalid vmwdt register offset {:#x}", reg_offset),
        }
    }
}

impl Suspendable for Vmwdt {
    fn sleep(&mut self) -> anyhow::Result<()> {
        if let Some(worker) = self.worker_thread.take() {
            worker.stop();
        }
        Ok(())
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            self.start();
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use std::thread::sleep;

    use base::poll_assert;
    use base::Tube;

    use super::*;

    const AARCH64_VMWDT_ADDR: u64 = 0x3000;
    const TEST_VMWDT_CPU_NO: usize = 0x1;

    fn vmwdt_bus_address(offset: u64) -> BusAccessInfo {
        BusAccessInfo {
            offset,
            address: AARCH64_VMWDT_ADDR,
            id: 0,
        }
    }

    #[test]
    fn test_watchdog_internal_timer() {
        let (vm_evt_wrtube, _vm_evt_rdtube) = Tube::directional_pair().unwrap();
        let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube).unwrap();

        // Configure the watchdog device with a 10Hz internal clock
        device.write(
            vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64),
            &[10, 0, 0, 0],
        );
        device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]);
        device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]);
        let next_expiration_ms = {
            let mut vmwdt_locked = device.vm_wdts.lock();
            // In the test scenario there is no guest updating
            // /proc/stat::guest_time, so get_guest_time_ms() returns 0
            vmwdt_locked[0].last_guest_time_ms = 10;
            vmwdt_locked[0].next_expiration_interval_ms
        };

        // Poll multiple times as we don't get a signal when the watchdog thread has run.
        poll_assert!(10, || {
            sleep(Duration::from_millis(50));
            let vmwdt_locked = device.vm_wdts.lock();
            // Verify that our timer expired and next_expiration_interval_ms changed
            vmwdt_locked[0].next_expiration_interval_ms != next_expiration_ms
        });
    }

    #[test]
    fn test_watchdog_expiration() {
        let (vm_evt_wrtube, vm_evt_rdtube) = Tube::directional_pair().unwrap();
        let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube).unwrap();

        // Configure the watchdog device with a 10Hz internal clock
        device.write(
            vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64),
            &[10, 0, 0, 0],
        );
        device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]);
        device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]);
        // In the test scenario there is no guest updating /proc/stat::guest_time,
        // so get_guest_time_ms() returns 0. Backdating last_guest_time_ms makes
        // the device see guest progress without a pet, forcing a watchdog reset.
        device.vm_wdts.lock()[0].last_guest_time_ms = -100;

        // Poll multiple times as we don't get a signal when the watchdog thread has run.
        poll_assert!(10, || {
            sleep(Duration::from_millis(50));
            match vm_evt_rdtube.recv::<VmEventType>() {
                Ok(vm_event) => vm_event == VmEventType::WatchdogReset,
                Err(_e) => false,
            }
        });
    }
}