1 // Copyright 2022 The ChromiumOS Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 //! vmwdt is a virtual watchdog memory mapped device which detects stalls 6 //! on the vCPUs and resets the guest when no 'pet' events are received. 7 //! <https://docs.google.com/document/d/1DYmk2roxlwHZsOfcJi8xDMdWOHAmomvs2SDh7KPud3Y/edit?usp=sharing&resourcekey=0-oSNabc-t040a1q0K4cyI8Q> 8 9 use std::convert::TryFrom; 10 use std::fs; 11 use std::io::Error as IoError; 12 use std::process; 13 use std::sync::Arc; 14 use std::time::Duration; 15 16 use base::debug; 17 use base::error; 18 use base::gettid; 19 use base::warn; 20 use base::AsRawDescriptor; 21 use base::Descriptor; 22 use base::Error as SysError; 23 use base::Event; 24 use base::EventToken; 25 use base::SendTube; 26 use base::Timer; 27 use base::TimerTrait; 28 use base::VmEventType; 29 use base::WaitContext; 30 use base::WorkerThread; 31 use remain::sorted; 32 use sync::Mutex; 33 use thiserror::Error; 34 35 use crate::pci::CrosvmDeviceId; 36 use crate::BusAccessInfo; 37 use crate::BusDevice; 38 use crate::DeviceId; 39 use crate::Suspendable; 40 41 // Registers offsets 42 const VMWDT_REG_STATUS: u32 = 0x00; 43 const VMWDT_REG_LOAD_CNT: u32 = 0x04; 44 const VMWDT_REG_CURRENT_CNT: u32 = 0x08; 45 const VMWDT_REG_CLOCK_FREQ_HZ: u32 = 0x0C; 46 47 // Length of the registers 48 const VMWDT_REG_LEN: u64 = 0x10; 49 50 pub const VMWDT_DEFAULT_TIMEOUT_SEC: u32 = 10; 51 pub const VMWDT_DEFAULT_CLOCK_HZ: u32 = 2; 52 53 // Proc stat indexes 54 const PROCSTAT_GUEST_TIME_INDX: usize = 42; 55 56 #[sorted] 57 #[derive(Error, Debug)] 58 pub enum VmwdtError { 59 /// Error while creating event. 60 #[error("failed to create event: {0}")] 61 CreateEvent(SysError), 62 /// Error while trying to create worker thread. 63 #[error("failed to spawn thread: {0}")] 64 SpawnThread(IoError), 65 /// Error while trying to create timer. 66 #[error("failed to create vmwdt counter due to timer fd: {0}")] 67 TimerCreateError(SysError), 68 #[error("failed to wait for events: {0}")] 69 WaitError(SysError), 70 } 71 72 type VmwdtResult<T> = std::result::Result<T, VmwdtError>; 73 74 pub struct VmwdtPerCpu { 75 // Flag which indicated if the watchdog is started 76 is_enabled: bool, 77 // Timer used to generate periodic events at `timer_freq_hz` frequency 78 timer: Timer, 79 // The frequency of the `timer` 80 timer_freq_hz: u64, 81 // Timestamp measured in miliseconds of the last guest activity 82 last_guest_time_ms: i64, 83 // The pid of the thread this vcpu belongs to 84 pid: u32, 85 // The process id of the task this vcpu belongs to 86 ppid: u32, 87 // The pre-programmed one-shot expiration interval. If the guest runs in this 88 // interval but we don't receive a periodic event, the guest is stalled. 89 next_expiration_interval_ms: i64, 90 } 91 92 pub struct Vmwdt { 93 vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>, 94 // The worker thread that waits on the timer fd 95 worker_thread: Option<WorkerThread<()>>, 96 // TODO: @sebastianene add separate reset event for the watchdog 97 // Reset source if the device is not responding 98 reset_evt_wrtube: SendTube, 99 activated: bool, 100 } 101 102 impl Vmwdt { new(cpu_count: usize, reset_evt_wrtube: SendTube) -> VmwdtResult<Vmwdt>103 pub fn new(cpu_count: usize, reset_evt_wrtube: SendTube) -> VmwdtResult<Vmwdt> { 104 let mut vec = Vec::new(); 105 for _ in 0..cpu_count { 106 vec.push(VmwdtPerCpu { 107 last_guest_time_ms: 0, 108 pid: 0, 109 ppid: 0, 110 is_enabled: false, 111 timer: Timer::new().unwrap(), 112 timer_freq_hz: 0, 113 next_expiration_interval_ms: 0, 114 }); 115 } 116 let vm_wdts = Arc::new(Mutex::new(vec)); 117 118 Ok(Vmwdt { 119 vm_wdts, 120 worker_thread: None, 121 reset_evt_wrtube, 122 activated: false, 123 }) 124 } 125 vmwdt_worker_thread( vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>, kill_evt: Event, reset_evt_wrtube: SendTube, )126 pub fn vmwdt_worker_thread( 127 vm_wdts: Arc<Mutex<Vec<VmwdtPerCpu>>>, 128 kill_evt: Event, 129 reset_evt_wrtube: SendTube, 130 ) { 131 #[derive(EventToken)] 132 enum Token { 133 Kill, 134 Timer(usize), 135 } 136 137 let wait_ctx: WaitContext<Token> = WaitContext::new().unwrap(); 138 wait_ctx.add(&kill_evt, Token::Kill).unwrap(); 139 140 let len = vm_wdts.lock().len(); 141 for clock_id in 0..len { 142 let timer_fd = vm_wdts.lock()[clock_id].timer.as_raw_descriptor(); 143 wait_ctx 144 .add(&Descriptor(timer_fd), Token::Timer(clock_id)) 145 .unwrap(); 146 } 147 148 loop { 149 let events = wait_ctx.wait().unwrap(); 150 for event in events.iter().filter(|e| e.is_readable) { 151 match event.token { 152 Token::Kill => { 153 return; 154 } 155 Token::Timer(cpu_id) => { 156 let mut wdts_locked = vm_wdts.lock(); 157 let watchdog = &mut wdts_locked[cpu_id]; 158 if let Err(_e) = watchdog.timer.wait() { 159 error!("error waiting for timer event on vcpu {}", cpu_id); 160 } 161 162 let current_guest_time_ms_result = 163 Vmwdt::get_guest_time_ms(watchdog.ppid, watchdog.pid); 164 let current_guest_time_ms = match current_guest_time_ms_result { 165 Ok(value) => value, 166 Err(_e) => return, 167 }; 168 let remaining_time_ms = watchdog.next_expiration_interval_ms 169 - (current_guest_time_ms - watchdog.last_guest_time_ms); 170 171 if remaining_time_ms > 0 { 172 watchdog.next_expiration_interval_ms = remaining_time_ms; 173 if let Err(_e) = watchdog 174 .timer 175 .reset(Duration::from_millis(remaining_time_ms as u64), None) 176 { 177 error!("failed to reset internal timer on vcpu {}", cpu_id); 178 } 179 } else { 180 // The guest ran but it did not send the periodic event 181 if let Err(_e) = 182 reset_evt_wrtube.send::<VmEventType>(&VmEventType::WatchdogReset) 183 { 184 error!("failed to send reset event from vcpu {}", cpu_id) 185 } 186 } 187 } 188 } 189 } 190 } 191 } 192 start(&mut self)193 fn start(&mut self) { 194 let vm_wdts = self.vm_wdts.clone(); 195 let reset_evt_wrtube = self.reset_evt_wrtube.try_clone().unwrap(); 196 197 self.activated = true; 198 self.worker_thread = Some(WorkerThread::start("vmwdt worker", |kill_evt| { 199 Vmwdt::vmwdt_worker_thread(vm_wdts, kill_evt, reset_evt_wrtube) 200 })); 201 } 202 ensure_started(&mut self)203 fn ensure_started(&mut self) { 204 if self.worker_thread.is_some() { 205 return; 206 } 207 208 self.start(); 209 } 210 211 #[cfg(any(target_os = "linux", target_os = "android"))] get_guest_time_ms(ppid: u32, pid: u32) -> Result<i64, SysError>212 pub fn get_guest_time_ms(ppid: u32, pid: u32) -> Result<i64, SysError> { 213 // TODO: @sebastianene check if we can avoid open-read-close on each call 214 let stat_path = format!("/proc/{}/task/{}/stat", ppid, pid); 215 let contents = fs::read_to_string(stat_path)?; 216 217 let gtime_ticks = contents 218 .split_whitespace() 219 .nth(PROCSTAT_GUEST_TIME_INDX) 220 .and_then(|guest_time| guest_time.parse::<u64>().ok()) 221 .unwrap_or(0); 222 223 // SAFETY: 224 // Safe because this just returns an integer 225 let ticks_per_sec = unsafe { libc::sysconf(libc::_SC_CLK_TCK) } as u64; 226 Ok((gtime_ticks * 1000 / ticks_per_sec) as i64) 227 } 228 229 #[cfg(not(any(target_os = "linux", target_os = "android")))] get_guest_time_ms(ppid: u32, pid: u32) -> Result<i64, SysError>230 pub fn get_guest_time_ms(ppid: u32, pid: u32) -> Result<i64, SysError> { 231 Ok(0) 232 } 233 } 234 235 impl BusDevice for Vmwdt { debug_label(&self) -> String236 fn debug_label(&self) -> String { 237 "Vmwdt".to_owned() 238 } 239 device_id(&self) -> DeviceId240 fn device_id(&self) -> DeviceId { 241 CrosvmDeviceId::VmWatchdog.into() 242 } 243 read(&mut self, _offset: BusAccessInfo, _data: &mut [u8])244 fn read(&mut self, _offset: BusAccessInfo, _data: &mut [u8]) {} 245 write(&mut self, info: BusAccessInfo, data: &[u8])246 fn write(&mut self, info: BusAccessInfo, data: &[u8]) { 247 let data_array = match <&[u8; 4]>::try_from(data) { 248 Ok(array) => array, 249 _ => { 250 error!("Bad write size: {} for vmwdt", data.len()); 251 return; 252 } 253 }; 254 255 let reg_val = u32::from_ne_bytes(*data_array); 256 let cpu_index: usize = (info.offset / VMWDT_REG_LEN) as usize; 257 let reg_offset = (info.offset % VMWDT_REG_LEN) as u32; 258 259 if cpu_index > self.vm_wdts.lock().len() { 260 error!("Bad write cpu_index {}", cpu_index); 261 return; 262 } 263 264 match reg_offset { 265 VMWDT_REG_STATUS => { 266 self.ensure_started(); 267 let mut wdts_locked = self.vm_wdts.lock(); 268 let cpu_watchdog = &mut wdts_locked[cpu_index]; 269 270 cpu_watchdog.is_enabled = reg_val != 0; 271 272 if reg_val != 0 { 273 let due = Duration::from_nanos(1); 274 let interval = Duration::from_millis(1000 / cpu_watchdog.timer_freq_hz); 275 cpu_watchdog.timer.reset(due, Some(interval)).unwrap(); 276 } else { 277 cpu_watchdog.timer.clear().unwrap(); 278 } 279 } 280 VMWDT_REG_LOAD_CNT => { 281 let ppid = process::id(); 282 let pid = gettid(); 283 let guest_time_ms_result = Vmwdt::get_guest_time_ms(ppid, pid as u32); 284 let guest_time_ms = match guest_time_ms_result { 285 Ok(time) => time, 286 Err(_e) => return, 287 }; 288 289 let mut wdts_locked = self.vm_wdts.lock(); 290 let cpu_watchdog = &mut wdts_locked[cpu_index]; 291 let next_expiration_interval_ms = 292 reg_val as u64 * 1000 / cpu_watchdog.timer_freq_hz; 293 294 cpu_watchdog.pid = pid as u32; 295 cpu_watchdog.ppid = ppid; 296 cpu_watchdog.last_guest_time_ms = guest_time_ms; 297 cpu_watchdog.next_expiration_interval_ms = next_expiration_interval_ms as i64; 298 299 if cpu_watchdog.is_enabled { 300 if let Err(_e) = cpu_watchdog 301 .timer 302 .reset(Duration::from_millis(next_expiration_interval_ms), None) 303 { 304 error!("failed to reset one-shot vcpu time {}", cpu_index); 305 } 306 } 307 } 308 VMWDT_REG_CURRENT_CNT => { 309 warn!("invalid write to read-only VMWDT_REG_CURRENT_CNT register"); 310 } 311 VMWDT_REG_CLOCK_FREQ_HZ => { 312 let mut wdts_locked = self.vm_wdts.lock(); 313 let cpu_watchdog = &mut wdts_locked[cpu_index]; 314 315 debug!( 316 "CPU:{:x} wrote VMWDT_REG_CLOCK_FREQ_HZ {:x}", 317 cpu_index, reg_val 318 ); 319 cpu_watchdog.timer_freq_hz = reg_val as u64; 320 } 321 _ => unreachable!(), 322 } 323 } 324 } 325 326 impl Suspendable for Vmwdt { sleep(&mut self) -> anyhow::Result<()>327 fn sleep(&mut self) -> anyhow::Result<()> { 328 if let Some(worker) = self.worker_thread.take() { 329 worker.stop(); 330 } 331 Ok(()) 332 } 333 wake(&mut self) -> anyhow::Result<()>334 fn wake(&mut self) -> anyhow::Result<()> { 335 if self.activated { 336 self.start(); 337 } 338 Ok(()) 339 } 340 } 341 342 #[cfg(test)] 343 mod tests { 344 use std::thread::sleep; 345 346 use base::poll_assert; 347 use base::Tube; 348 349 use super::*; 350 351 const AARCH64_VMWDT_ADDR: u64 = 0x3000; 352 const TEST_VMWDT_CPU_NO: usize = 0x1; 353 vmwdt_bus_address(offset: u64) -> BusAccessInfo354 fn vmwdt_bus_address(offset: u64) -> BusAccessInfo { 355 BusAccessInfo { 356 offset, 357 address: AARCH64_VMWDT_ADDR, 358 id: 0, 359 } 360 } 361 362 #[test] test_watchdog_internal_timer()363 fn test_watchdog_internal_timer() { 364 let (vm_evt_wrtube, _vm_evt_rdtube) = Tube::directional_pair().unwrap(); 365 let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube).unwrap(); 366 367 // Configure the watchdog device, 2Hz internal clock 368 device.write( 369 vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64), 370 &[10, 0, 0, 0], 371 ); 372 device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]); 373 device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]); 374 let next_expiration_ms = { 375 let mut vmwdt_locked = device.vm_wdts.lock(); 376 // In the test scenario the guest does not interpret the /proc/stat::guest_time, thus 377 // the function get_guest_time() returns 0 378 vmwdt_locked[0].last_guest_time_ms = 10; 379 vmwdt_locked[0].next_expiration_interval_ms 380 }; 381 382 // Poll multiple times as we don't get a signal when the watchdog thread has run. 383 poll_assert!(10, || { 384 sleep(Duration::from_millis(50)); 385 let vmwdt_locked = device.vm_wdts.lock(); 386 // Verify that our timer expired and the next_expiration_interval_ms changed 387 vmwdt_locked[0].next_expiration_interval_ms != next_expiration_ms 388 }); 389 } 390 391 #[test] test_watchdog_expiration()392 fn test_watchdog_expiration() { 393 let (vm_evt_wrtube, vm_evt_rdtube) = Tube::directional_pair().unwrap(); 394 let mut device = Vmwdt::new(TEST_VMWDT_CPU_NO, vm_evt_wrtube).unwrap(); 395 396 // Configure the watchdog device, 2Hz internal clock 397 device.write( 398 vmwdt_bus_address(VMWDT_REG_CLOCK_FREQ_HZ as u64), 399 &[10, 0, 0, 0], 400 ); 401 device.write(vmwdt_bus_address(VMWDT_REG_LOAD_CNT as u64), &[1, 0, 0, 0]); 402 device.write(vmwdt_bus_address(VMWDT_REG_STATUS as u64), &[1, 0, 0, 0]); 403 // In the test scenario the guest does not interpret the /proc/stat::guest_time, thus 404 // the function get_guest_time() returns 0 405 device.vm_wdts.lock()[0].last_guest_time_ms = -100; 406 407 // Poll multiple times as we don't get a signal when the watchdog thread has run. 408 poll_assert!(10, || { 409 sleep(Duration::from_millis(50)); 410 match vm_evt_rdtube.recv::<VmEventType>() { 411 Ok(vm_event) => vm_event == VmEventType::WatchdogReset, 412 Err(_e) => false, 413 } 414 }); 415 } 416 } 417