1 // Copyright 2020 The ChromiumOS Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 use std::convert::TryFrom; 6 use std::convert::TryInto; 7 use std::fmt; 8 use std::fmt::Display; 9 use std::iter; 10 use std::sync::Arc; 11 12 cfg_if::cfg_if! { 13 if #[cfg(test)] { 14 use base::{FakeClock as Clock, FakeTimer as Timer}; 15 } else { 16 use base::{Clock, Timer}; 17 } 18 } 19 use anyhow::Context; 20 use base::error; 21 use base::info; 22 use base::warn; 23 use base::AsRawDescriptor; 24 use base::Descriptor; 25 use base::Error; 26 use base::Event; 27 use base::EventToken; 28 use base::Result; 29 use base::Tube; 30 use base::WaitContext; 31 use base::WorkerThread; 32 use hypervisor::DeliveryMode; 33 use hypervisor::IoapicState; 34 use hypervisor::IrqRoute; 35 use hypervisor::IrqSource; 36 use hypervisor::IrqSourceChip; 37 use hypervisor::LapicState; 38 use hypervisor::MPState; 39 use hypervisor::MsiAddressMessage; 40 use hypervisor::MsiDataMessage; 41 use hypervisor::PicSelect; 42 use hypervisor::PicState; 43 use hypervisor::PitState; 44 use hypervisor::Vcpu; 45 use hypervisor::VcpuX86_64; 46 use resources::SystemAllocator; 47 use snapshot::AnySnapshot; 48 use sync::Condvar; 49 use sync::Mutex; 50 51 use crate::bus::BusDeviceSync; 52 use crate::irqchip::Apic; 53 use crate::irqchip::ApicBusMsg; 54 use crate::irqchip::DelayedIoApicIrqEvents; 55 use crate::irqchip::Interrupt; 56 use crate::irqchip::InterruptData; 57 use crate::irqchip::InterruptDestination; 58 use crate::irqchip::Ioapic; 59 use crate::irqchip::IrqEvent; 60 use crate::irqchip::IrqEventIndex; 61 use crate::irqchip::Pic; 62 use crate::irqchip::Routes; 63 use crate::irqchip::VcpuRunState; 64 use crate::irqchip::APIC_BASE_ADDRESS; 65 use crate::irqchip::APIC_MEM_LENGTH_BYTES; 66 use crate::irqchip::IOAPIC_BASE_ADDRESS; 67 use crate::irqchip::IOAPIC_MEM_LENGTH_BYTES; 68 use crate::pci::CrosvmDeviceId; 69 use crate::Bus; 70 use 
crate::BusAccessInfo; 71 use crate::BusDevice; 72 use crate::DeviceId; 73 use crate::IrqChip; 74 use crate::IrqChipCap; 75 use crate::IrqChipX86_64; 76 use crate::IrqEdgeEvent; 77 use crate::IrqEventSource; 78 use crate::IrqLevelEvent; 79 use crate::Pit; 80 use crate::PitError; 81 use crate::Suspendable; 82 83 /// PIT channel 0 timer is connected to IRQ 0 84 const PIT_CHANNEL0_IRQ: u32 = 0; 85 /// CR0 extension type bit 86 const X86_CR0_ET: u64 = 0x00000010; 87 /// CR0 not write through bit 88 const X86_CR0_NW: u64 = 0x20000000; 89 /// CR0 cache disable bit 90 const X86_CR0_CD: u64 = 0x40000000; 91 /// Default power on state of CR0 register, according to the Intel manual. 92 const X86_CR0_INIT: u64 = X86_CR0_ET | X86_CR0_NW | X86_CR0_CD; 93 94 /// An `IrqChip` with all interrupt devices emulated in userspace. `UserspaceIrqChip` works with 95 /// any hypervisor, but only supports x86. 96 pub struct UserspaceIrqChip<V: VcpuX86_64> { 97 pub vcpus: Arc<Mutex<Vec<Option<V>>>>, 98 routes: Arc<Mutex<Routes>>, 99 pit: Arc<Mutex<Pit>>, 100 pic: Arc<Mutex<Pic>>, 101 ioapic: Arc<Mutex<Ioapic>>, 102 ioapic_pins: usize, 103 pub apics: Vec<Arc<Mutex<Apic>>>, 104 // Condition variables used by wait_until_runnable. 105 waiters: Vec<Arc<Waiter>>, 106 // Raw descriptors of the apic Timers. 107 timer_descriptors: Vec<Descriptor>, 108 /// Delayed ioapic irq object, that contains the delayed events because the ioapic was locked 109 /// when service_irq was called on the irqchip. This prevents deadlocks when a Vcpu thread has 110 /// locked the ioapic and the ioapic sends a AddMsiRoute signal to the main thread (which 111 /// itself may be busy trying to call service_irq). 112 /// 113 /// ## Note: 114 /// This lock may be locked by itself to access the `DelayedIoApicIrqEvents`. If accessed in 115 /// conjunction with the `irq_events` field, that lock should be taken first to prevent 116 /// deadlocks stemming from lock-ordering issues. 
117 delayed_ioapic_irq_events: Arc<Mutex<DelayedIoApicIrqEvents>>, 118 // Array of Events that devices will use to assert ioapic pins. 119 irq_events: Arc<Mutex<Vec<Option<IrqEvent>>>>, 120 dropper: Arc<Mutex<Dropper>>, 121 activated: bool, 122 } 123 124 /// Helper that implements `Drop` on behalf of `UserspaceIrqChip`. The many cloned copies of an irq 125 /// chip share a single arc'ed `Dropper`, which only runs its drop when the last irq chip copy is 126 /// dropped. 127 struct Dropper { 128 /// Worker threads that deliver timer events to the APICs. 129 workers: Vec<WorkerThread<TimerWorkerResult<()>>>, 130 } 131 132 impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> { 133 /// Constructs a new `UserspaceIrqChip`. new(num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>) -> Result<Self>134 pub fn new(num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>) -> Result<Self> { 135 let clock = Arc::new(Mutex::new(Clock::new())); 136 Self::new_with_clock(num_vcpus, irq_tube, ioapic_pins, clock) 137 } 138 139 /// Constructs a new `UserspaceIrqChip`, with a clock. Used for testing. new_with_clock( num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>, clock: Arc<Mutex<Clock>>, ) -> Result<Self>140 pub fn new_with_clock( 141 num_vcpus: usize, 142 irq_tube: Tube, 143 ioapic_pins: Option<usize>, 144 clock: Arc<Mutex<Clock>>, 145 ) -> Result<Self> { 146 let pit_evt = IrqEdgeEvent::new()?; 147 // For test only, this clock instance is FakeClock. It needs to be cloned for every Timer 148 // instance, so make a clone for it now. 
149 #[cfg(test)] 150 let test_clock = clock.clone(); 151 let pit = Pit::new(pit_evt.try_clone()?, clock).map_err(|e| match e { 152 PitError::CloneEvent(err) => err, 153 PitError::CreateEvent(err) => err, 154 PitError::CreateWaitContext(err) => err, 155 PitError::TimerCreateError(err) => err, 156 PitError::WaitError(err) => err, 157 PitError::SpawnThread(_) => Error::new(libc::EIO), 158 })?; 159 let pit_event_source = IrqEventSource::from_device(&pit); 160 161 let ioapic_pins = ioapic_pins.unwrap_or(hypervisor::NUM_IOAPIC_PINS); 162 let ioapic = Ioapic::new(irq_tube, ioapic_pins)?; 163 164 let mut timer_descriptors: Vec<Descriptor> = Vec::with_capacity(num_vcpus); 165 let mut apics: Vec<Arc<Mutex<Apic>>> = Vec::with_capacity(num_vcpus); 166 for id in 0..num_vcpus { 167 cfg_if::cfg_if! { 168 if #[cfg(test)] { 169 let timer = Timer::new(test_clock.clone()); 170 } else { 171 let timer = Timer::new()?; 172 } 173 } 174 // Timers are owned by the apics, which outlive the raw descriptors stored here and in 175 // the worker threads. 
176 timer_descriptors.push(Descriptor(timer.as_raw_descriptor())); 177 178 let id: u8 = id.try_into().or(Err(Error::new(libc::EINVAL)))?; 179 let apic = Apic::new(id, Box::new(timer)); 180 apics.push(Arc::new(Mutex::new(apic))); 181 } 182 let dropper = Dropper { 183 workers: Vec::new(), 184 }; 185 186 let mut chip = UserspaceIrqChip { 187 vcpus: Arc::new(Mutex::new( 188 iter::repeat_with(|| None).take(num_vcpus).collect(), 189 )), 190 waiters: iter::repeat_with(Default::default) 191 .take(num_vcpus) 192 .collect(), 193 routes: Arc::new(Mutex::new(Routes::new())), 194 pit: Arc::new(Mutex::new(pit)), 195 pic: Arc::new(Mutex::new(Pic::new())), 196 ioapic: Arc::new(Mutex::new(ioapic)), 197 ioapic_pins, 198 apics, 199 timer_descriptors, 200 delayed_ioapic_irq_events: Arc::new(Mutex::new(DelayedIoApicIrqEvents::new()?)), 201 irq_events: Arc::new(Mutex::new(Vec::new())), 202 dropper: Arc::new(Mutex::new(dropper)), 203 activated: false, 204 }; 205 206 // Setup standard x86 irq routes 207 chip.set_irq_routes(&Routes::default_pic_ioapic_routes(ioapic_pins))?; 208 209 chip.register_edge_irq_event(PIT_CHANNEL0_IRQ, &pit_evt, pit_event_source)?; 210 Ok(chip) 211 } 212 213 /// Handles a message from an APIC. handle_msg(&self, msg: ApicBusMsg)214 fn handle_msg(&self, msg: ApicBusMsg) { 215 match msg { 216 ApicBusMsg::Eoi(vector) => { 217 let _ = self.broadcast_eoi(vector); 218 } 219 ApicBusMsg::Ipi(interrupt) => self.send_irq_to_apics(&interrupt), 220 } 221 } 222 223 /// Sends a Message Signaled Interrupt to one or more APICs. MSIs are a 64-bit address and 224 /// 32-bit data, but in the Intel spec we're implementing, only the low 32 bits of the address 225 /// are used. 
send_msi(&self, addr: u32, data: u32)226 fn send_msi(&self, addr: u32, data: u32) { 227 let mut msi_addr = MsiAddressMessage::new(); 228 msi_addr.set(0, 32, addr as u64); 229 let dest = match InterruptDestination::try_from(&msi_addr) { 230 Ok(dest) => dest, 231 Err(e) => { 232 warn!("Invalid MSI message: {}", e); 233 return; 234 } 235 }; 236 237 let mut msi_data = MsiDataMessage::new(); 238 msi_data.set(0, 32, data as u64); 239 let data = InterruptData::from(&msi_data); 240 241 self.send_irq_to_apics(&Interrupt { dest, data }); 242 } 243 send_irq_to_apic(&self, id: usize, irq: &InterruptData)244 pub fn send_irq_to_apic(&self, id: usize, irq: &InterruptData) { 245 // id can come from the guest, so check bounds. 246 if let Some(apic) = self.apics.get(id) { 247 apic.lock().accept_irq(irq); 248 } else { 249 error!("Interrupt for non-existent apic {}: {:?}", id, irq); 250 } 251 if let Some(Some(vcpu)) = self.vcpus.lock().get(id) { 252 vcpu.set_interrupt_window_requested(true); 253 } else { 254 error!("Interrupt for non-existent vcpu {}: {:?}", id, irq); 255 } 256 self.waiters[id].notify(); 257 } 258 259 /// Sends an interrupt to one or more APICs. Used for sending MSIs and IPIs. send_irq_to_apics(&self, irq: &Interrupt)260 pub fn send_irq_to_apics(&self, irq: &Interrupt) { 261 match irq.data.delivery { 262 DeliveryMode::Fixed | DeliveryMode::Lowest | DeliveryMode::RemoteRead => {} 263 _ => info!("UserspaceIrqChip received special irq: {:?}", irq), 264 } 265 266 // First try the fast path, where the destination is a single APIC we can send to directly. 
267 if let Some(apic_id) = Apic::single_dest_fast(&irq.dest) { 268 self.send_irq_to_apic(apic_id as usize, &irq.data); 269 return; 270 } 271 272 let lowest_mode = irq.data.delivery == DeliveryMode::Lowest; 273 let mut lowest_priority = u8::MAX; 274 let mut lowest_apic: Option<usize> = None; 275 276 for (i, apic) in self.apics.iter().enumerate() { 277 let send = { 278 let apic = apic.lock(); 279 if !apic.match_dest(&irq.dest) { 280 false 281 } else if lowest_mode { 282 let priority = apic.get_processor_priority(); 283 if priority <= lowest_priority { 284 lowest_priority = priority; 285 lowest_apic = Some(i); 286 } 287 false 288 } else { 289 true 290 } 291 }; 292 if send { 293 self.send_irq_to_apic(i, &irq.data); 294 } 295 } 296 297 if lowest_mode { 298 if let Some(index) = lowest_apic { 299 self.send_irq_to_apic(index, &irq.data); 300 } else { 301 // According to sections 10.6.2.1 and 10.6.2.2 of the SDM, the OS should not let 302 // this happen. If the OS is misconfigured then drop the interrupt and log a 303 // warning. 304 warn!( 305 "Lowest priority interrupt sent, but no apics configured as valid target: {:?}", 306 irq 307 ); 308 } 309 } 310 } 311 312 /// Delivers a startup IPI to `vcpu`. deliver_startup(&self, vcpu: &V, vector: u8) -> Result<()>313 fn deliver_startup(&self, vcpu: &V, vector: u8) -> Result<()> { 314 // This comes from Intel SDM volume 3, chapter 8.4. The vector specifies a page aligned 315 // address where execution should start. cs.base is the offset for the code segment with an 316 // RIP of 0. The cs.selector is just the base shifted right by 4 bits. 317 let mut sregs = vcpu.get_sregs()?; 318 sregs.cs.base = (vector as u64) << 12; 319 sregs.cs.selector = (vector as u16) << 8; 320 321 // Set CR0 to its INIT value per the manual. Application processors won't boot with the CR0 322 // protected mode and paging bits set by setup_sregs(). 
Kernel APIC doesn't have this 323 // issue, probably because it uses MSRs instead of MMIO, so it's less affected when the AP's 324 // state (CR3 etc.) doesn't reflect changes that Linux made while booting vcpu 0. 325 sregs.cr0 = X86_CR0_INIT; 326 vcpu.set_sregs(&sregs)?; 327 328 let mut regs = vcpu.get_regs()?; 329 regs.rip = 0; 330 vcpu.set_regs(®s)?; 331 332 Ok(()) 333 } 334 335 /// Checks if the specified VCPU is in a runnable state. is_runnable(&self, vcpu_id: usize) -> bool336 fn is_runnable(&self, vcpu_id: usize) -> bool { 337 self.apics[vcpu_id].lock().get_mp_state() == MPState::Runnable 338 } 339 } 340 341 impl Dropper { sleep(&mut self) -> anyhow::Result<()>342 fn sleep(&mut self) -> anyhow::Result<()> { 343 for thread in self.workers.split_off(0).into_iter() { 344 thread 345 .stop() 346 .context("UserspaceIrqChip worker thread exited with error")?; 347 } 348 Ok(()) 349 } 350 } 351 352 impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> { register_irq_event( &mut self, irq: u32, irq_event: &Event, resample_event: Option<&Event>, source: IrqEventSource, ) -> Result<Option<IrqEventIndex>>353 fn register_irq_event( 354 &mut self, 355 irq: u32, 356 irq_event: &Event, 357 resample_event: Option<&Event>, 358 source: IrqEventSource, 359 ) -> Result<Option<IrqEventIndex>> { 360 let mut evt = IrqEvent { 361 gsi: irq, 362 event: irq_event.try_clone()?, 363 resample_event: None, 364 source, 365 }; 366 if let Some(resample_event) = resample_event { 367 evt.resample_event = Some(resample_event.try_clone()?); 368 } 369 370 let mut irq_events = self.irq_events.lock(); 371 let index = irq_events.len(); 372 irq_events.push(Some(evt)); 373 Ok(Some(index)) 374 } 375 unregister_irq_event(&mut self, irq: u32, irq_event: &Event) -> Result<()>376 fn unregister_irq_event(&mut self, irq: u32, irq_event: &Event) -> Result<()> { 377 let mut irq_events = self.irq_events.lock(); 378 for (index, evt) in irq_events.iter().enumerate() { 379 if let Some(evt) = evt { 380 if evt.gsi == irq && 
irq_event.eq(&evt.event) { 381 irq_events[index] = None; 382 break; 383 } 384 } 385 } 386 Ok(()) 387 } 388 } 389 390 impl<V: VcpuX86_64 + 'static> IrqChip for UserspaceIrqChip<V> { add_vcpu(&mut self, vcpu_id: usize, vcpu: &dyn Vcpu) -> Result<()>391 fn add_vcpu(&mut self, vcpu_id: usize, vcpu: &dyn Vcpu) -> Result<()> { 392 let vcpu: &V = vcpu 393 .downcast_ref() 394 .expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type"); 395 self.vcpus.lock()[vcpu_id] = Some(vcpu.try_clone()?); 396 Ok(()) 397 } 398 register_edge_irq_event( &mut self, irq: u32, irq_event: &IrqEdgeEvent, source: IrqEventSource, ) -> Result<Option<IrqEventIndex>>399 fn register_edge_irq_event( 400 &mut self, 401 irq: u32, 402 irq_event: &IrqEdgeEvent, 403 source: IrqEventSource, 404 ) -> Result<Option<IrqEventIndex>> { 405 self.register_irq_event(irq, irq_event.get_trigger(), None, source) 406 } 407 unregister_edge_irq_event(&mut self, irq: u32, irq_event: &IrqEdgeEvent) -> Result<()>408 fn unregister_edge_irq_event(&mut self, irq: u32, irq_event: &IrqEdgeEvent) -> Result<()> { 409 self.unregister_irq_event(irq, irq_event.get_trigger()) 410 } 411 register_level_irq_event( &mut self, irq: u32, irq_event: &IrqLevelEvent, source: IrqEventSource, ) -> Result<Option<IrqEventIndex>>412 fn register_level_irq_event( 413 &mut self, 414 irq: u32, 415 irq_event: &IrqLevelEvent, 416 source: IrqEventSource, 417 ) -> Result<Option<IrqEventIndex>> { 418 self.register_irq_event( 419 irq, 420 irq_event.get_trigger(), 421 Some(irq_event.get_resample()), 422 source, 423 ) 424 } 425 unregister_level_irq_event(&mut self, irq: u32, irq_event: &IrqLevelEvent) -> Result<()>426 fn unregister_level_irq_event(&mut self, irq: u32, irq_event: &IrqLevelEvent) -> Result<()> { 427 self.unregister_irq_event(irq, irq_event.get_trigger()) 428 } 429 route_irq(&mut self, route: IrqRoute) -> Result<()>430 fn route_irq(&mut self, route: IrqRoute) -> Result<()> { 431 self.routes.lock().add(route) 432 } 433 
set_irq_routes(&mut self, routes: &[IrqRoute]) -> Result<()>434 fn set_irq_routes(&mut self, routes: &[IrqRoute]) -> Result<()> { 435 self.routes.lock().replace_all(routes) 436 } 437 irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>>438 fn irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>> { 439 let mut tokens: Vec<(IrqEventIndex, IrqEventSource, Event)> = Vec::new(); 440 for (index, evt) in self.irq_events.lock().iter().enumerate() { 441 if let Some(evt) = evt { 442 tokens.push((index, evt.source.clone(), evt.event.try_clone()?)); 443 } 444 } 445 Ok(tokens) 446 } 447 service_irq(&mut self, irq: u32, level: bool) -> Result<()>448 fn service_irq(&mut self, irq: u32, level: bool) -> Result<()> { 449 for route in self.routes.lock()[irq as usize].iter() { 450 match *route { 451 IrqSource::Irqchip { 452 chip: IrqSourceChip::PicPrimary, 453 pin, 454 } 455 | IrqSource::Irqchip { 456 chip: IrqSourceChip::PicSecondary, 457 pin, 458 } => { 459 self.pic.lock().service_irq(pin as u8, level); 460 } 461 IrqSource::Irqchip { 462 chip: IrqSourceChip::Ioapic, 463 pin, 464 } => { 465 self.ioapic.lock().service_irq(pin as usize, level); 466 } 467 // service_irq's level parameter is ignored for MSIs. MSI data specifies the level. 468 IrqSource::Msi { address, data } => self.send_msi(address as u32, data), 469 _ => { 470 error!("Unexpected route source {:?}", route); 471 return Err(Error::new(libc::EINVAL)); 472 } 473 } 474 } 475 Ok(()) 476 } 477 478 /// Services an IRQ event by asserting then deasserting an IRQ line. The associated Event 479 /// that triggered the irq event will be read from. If the irq is associated with a resample 480 /// Event, then the deassert will only happen after an EOI is broadcast for a vector 481 /// associated with the irq line. 482 /// For UserspaceIrqChip, this function identifies the destination(s) of the irq: PIC, IOAPIC, 483 /// or APIC (MSI). 
If it's a PIC or IOAPIC route, we attempt to call service_irq on those 484 /// chips. If the IOAPIC is unable to be immediately locked, we add the irq to the 485 /// delayed_ioapic_irq_events (though we still read from the Event that triggered the irq 486 /// event). If it's an MSI route, we call send_msi to decode the MSI and send it to the 487 /// destination APIC(s). service_irq_event(&mut self, event_index: IrqEventIndex) -> Result<()>488 fn service_irq_event(&mut self, event_index: IrqEventIndex) -> Result<()> { 489 let irq_events = self.irq_events.lock(); 490 let evt = if let Some(evt) = &irq_events[event_index] { 491 evt 492 } else { 493 return Ok(()); 494 }; 495 evt.event.wait()?; 496 497 for route in self.routes.lock()[evt.gsi as usize].iter() { 498 match *route { 499 IrqSource::Irqchip { 500 chip: IrqSourceChip::PicPrimary, 501 pin, 502 } 503 | IrqSource::Irqchip { 504 chip: IrqSourceChip::PicSecondary, 505 pin, 506 } => { 507 let mut pic = self.pic.lock(); 508 if evt.resample_event.is_some() { 509 pic.service_irq(pin as u8, true); 510 } else { 511 pic.service_irq(pin as u8, true); 512 pic.service_irq(pin as u8, false); 513 } 514 } 515 IrqSource::Irqchip { 516 chip: IrqSourceChip::Ioapic, 517 pin, 518 } => { 519 if let Ok(mut ioapic) = self.ioapic.try_lock() { 520 if evt.resample_event.is_some() { 521 ioapic.service_irq(pin as usize, true); 522 } else { 523 ioapic.service_irq(pin as usize, true); 524 ioapic.service_irq(pin as usize, false); 525 } 526 } else { 527 let mut delayed_events = self.delayed_ioapic_irq_events.lock(); 528 delayed_events.events.push(event_index); 529 delayed_events.trigger.signal().unwrap(); 530 } 531 } 532 IrqSource::Msi { address, data } => self.send_msi(address as u32, data), 533 _ => { 534 error!("Unexpected route source {:?}", route); 535 return Err(Error::new(libc::EINVAL)); 536 } 537 } 538 } 539 540 Ok(()) 541 } 542 543 /// Broadcasts an end of interrupt. For UserspaceIrqChip this sends the EOI to the ioapic. 
broadcast_eoi(&self, vector: u8) -> Result<()>544 fn broadcast_eoi(&self, vector: u8) -> Result<()> { 545 self.ioapic.lock().end_of_interrupt(vector); 546 Ok(()) 547 } 548 549 /// Injects any pending interrupts for `vcpu`. 550 /// 551 /// For UserspaceIrqChip this: 552 /// * Injects a PIC interrupt, if vcpu_id is 0 and vcpu is ready for interrupt 553 /// * Injects an APIC fixed interrupt, if vcpu is ready for interrupt and PIC didn't inject 554 /// * Injects APIC NMIs 555 /// * Handles APIC INIT IPIs 556 /// * Handles APIC SIPIs 557 /// * Requests an interrupt window, if PIC or APIC still has pending interrupts for this vcpu inject_interrupts(&self, vcpu: &dyn Vcpu) -> Result<()>558 fn inject_interrupts(&self, vcpu: &dyn Vcpu) -> Result<()> { 559 let vcpu: &V = vcpu 560 .downcast_ref() 561 .expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type"); 562 let vcpu_id = vcpu.id(); 563 let mut vcpu_ready = vcpu.ready_for_interrupt(); 564 565 let mut pic_needs_window = false; 566 if vcpu_id == 0 { 567 let mut pic = self.pic.lock(); 568 if vcpu_ready { 569 if let Some(vector) = pic.get_external_interrupt() { 570 vcpu.interrupt(vector)?; 571 self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable); 572 // Already injected a PIC interrupt, so APIC fixed interrupt can't be injected. 573 vcpu_ready = false; 574 } 575 } 576 pic_needs_window = pic.interrupt_requested(); 577 } 578 579 let irqs = self.apics[vcpu_id].lock().get_pending_irqs(vcpu_ready); 580 if let Some(vector) = irqs.fixed { 581 let do_interrupt = { 582 let mut apic = self.apics[vcpu_id].lock(); 583 match apic.get_mp_state() { 584 MPState::Runnable | MPState::Halted => { 585 // APIC interrupts should only be injectable when the MPState is 586 // Halted or Runnable. 587 apic.set_mp_state(&MPState::Runnable); 588 true 589 } 590 s => { 591 // This shouldn't happen, but log a helpful error if it does. 
592 error!("Interrupt cannot be injected while in state: {:?}", s); 593 false 594 } 595 } 596 }; 597 598 if do_interrupt { 599 vcpu.interrupt(vector)?; 600 } 601 } 602 for _ in 0..irqs.nmis { 603 let prev_state = self.apics[vcpu_id].lock().get_mp_state(); 604 vcpu.inject_nmi()?; 605 self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable); 606 info!( 607 "Delivered NMI to cpu {}, mp_state was {:?}, now is {:?}", 608 vcpu_id, 609 prev_state, 610 MPState::Runnable 611 ); 612 } 613 if irqs.init { 614 { 615 let mut apic = self.apics[vcpu_id].lock(); 616 apic.load_reset_state(); 617 apic.set_mp_state(&MPState::InitReceived); 618 } 619 info!("Delivered INIT IPI to cpu {}", vcpu_id); 620 } 621 if let Some(vector) = irqs.startup { 622 // If our state is not MPState::InitReceived then this is probably 623 // the second SIPI in the INIT-SIPI-SIPI sequence; ignore. 624 if self.apics[vcpu_id].lock().get_mp_state() == MPState::InitReceived { 625 self.deliver_startup(vcpu, vector)?; 626 self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable); 627 info!("Delivered SIPI to cpu {}", vcpu_id); 628 } 629 } 630 631 let needs_window = pic_needs_window || irqs.needs_window; 632 vcpu.set_interrupt_window_requested(needs_window); 633 634 Ok(()) 635 } 636 637 /// Notifies the irq chip that the specified VCPU has executed a halt instruction. 638 /// For `UserspaceIrqChip`, it sets the APIC's mp_state to `MPState::Halted`. halted(&self, vcpu_id: usize)639 fn halted(&self, vcpu_id: usize) { 640 self.apics[vcpu_id].lock().set_mp_state(&MPState::Halted) 641 } 642 643 /// Blocks until `vcpu` is in a runnable state or until interrupted by 644 /// `IrqChip::kick_halted_vcpus`. Returns `VcpuRunState::Runnable if vcpu is runnable, or 645 /// `VcpuRunState::Interrupted` if the wait was interrupted. 
646 /// For `UserspaceIrqChip`, if the APIC isn't `MPState::Runnable`, sleep until there are new 647 /// interrupts pending on the APIC, inject the interrupts, and go back to sleep if still not 648 /// runnable. wait_until_runnable(&self, vcpu: &dyn Vcpu) -> Result<VcpuRunState>649 fn wait_until_runnable(&self, vcpu: &dyn Vcpu) -> Result<VcpuRunState> { 650 let vcpu_id = vcpu.id(); 651 let waiter = &self.waiters[vcpu_id]; 652 let mut interrupted_lock = waiter.mtx.lock(); 653 loop { 654 if *interrupted_lock { 655 *interrupted_lock = false; 656 info!("wait_until_runnable interrupted on cpu {}", vcpu_id); 657 return Ok(VcpuRunState::Interrupted); 658 } 659 if self.is_runnable(vcpu_id) { 660 return Ok(VcpuRunState::Runnable); 661 } 662 663 self.inject_interrupts(vcpu)?; 664 if self.is_runnable(vcpu_id) { 665 return Ok(VcpuRunState::Runnable); 666 } 667 interrupted_lock = waiter.cvar.wait(interrupted_lock); 668 } 669 } 670 671 /// Makes unrunnable VCPUs return immediately from `wait_until_runnable`. 672 /// For UserspaceIrqChip, every vcpu gets kicked so its current or next call to 673 /// `wait_until_runnable` will immediately return false. After that one kick, subsequent 674 /// `wait_until_runnable` calls go back to waiting for runnability normally. 
kick_halted_vcpus(&self)675 fn kick_halted_vcpus(&self) { 676 for waiter in self.waiters.iter() { 677 waiter.set_and_notify(/* interrupted= */ true); 678 } 679 } 680 get_mp_state(&self, vcpu_id: usize) -> Result<MPState>681 fn get_mp_state(&self, vcpu_id: usize) -> Result<MPState> { 682 Ok(self.apics[vcpu_id].lock().get_mp_state()) 683 } 684 set_mp_state(&mut self, vcpu_id: usize, state: &MPState) -> Result<()>685 fn set_mp_state(&mut self, vcpu_id: usize, state: &MPState) -> Result<()> { 686 self.apics[vcpu_id].lock().set_mp_state(state); 687 Ok(()) 688 } 689 try_clone(&self) -> Result<Self>690 fn try_clone(&self) -> Result<Self> { 691 // kill_evts and timer_descriptors don't change, so they could be a plain Vec with each 692 // element cloned. But the Arc<Mutex> avoids a quadratic number of open descriptors from 693 // cloning, and those fields aren't performance critical. 694 Ok(UserspaceIrqChip { 695 vcpus: self.vcpus.clone(), 696 waiters: self.waiters.clone(), 697 routes: self.routes.clone(), 698 pit: self.pit.clone(), 699 pic: self.pic.clone(), 700 ioapic: self.ioapic.clone(), 701 ioapic_pins: self.ioapic_pins, 702 apics: self.apics.clone(), 703 timer_descriptors: self.timer_descriptors.clone(), 704 delayed_ioapic_irq_events: self.delayed_ioapic_irq_events.clone(), 705 irq_events: self.irq_events.clone(), 706 dropper: self.dropper.clone(), 707 activated: self.activated, 708 }) 709 } 710 711 // TODO(srichman): factor out UserspaceIrqChip and KvmSplitIrqChip::finalize_devices finalize_devices( &mut self, resources: &mut SystemAllocator, io_bus: &Bus, mmio_bus: &Bus, ) -> Result<()>712 fn finalize_devices( 713 &mut self, 714 resources: &mut SystemAllocator, 715 io_bus: &Bus, 716 mmio_bus: &Bus, 717 ) -> Result<()> { 718 // Insert pit into io_bus 719 io_bus.insert(self.pit.clone(), 0x040, 0x8).unwrap(); 720 io_bus.insert(self.pit.clone(), 0x061, 0x1).unwrap(); 721 722 // Insert pic into io_bus 723 io_bus.insert(self.pic.clone(), 0x20, 0x2).unwrap(); 724 
io_bus.insert(self.pic.clone(), 0xa0, 0x2).unwrap(); 725 io_bus.insert(self.pic.clone(), 0x4d0, 0x2).unwrap(); 726 727 // Insert ioapic into mmio_bus 728 mmio_bus 729 .insert( 730 self.ioapic.clone(), 731 IOAPIC_BASE_ADDRESS, 732 IOAPIC_MEM_LENGTH_BYTES, 733 ) 734 .unwrap(); 735 736 // Insert self into mmio_bus for handling APIC mmio 737 mmio_bus 738 .insert_sync( 739 Arc::new(self.try_clone()?), 740 APIC_BASE_ADDRESS, 741 APIC_MEM_LENGTH_BYTES, 742 ) 743 .unwrap(); 744 745 // At this point, all of our devices have been created and they have registered their 746 // irq events, so we can clone our resample events 747 let mut ioapic_resample_events: Vec<Vec<Event>> = 748 (0..self.ioapic_pins).map(|_| Vec::new()).collect(); 749 let mut pic_resample_events: Vec<Vec<Event>> = 750 (0..self.ioapic_pins).map(|_| Vec::new()).collect(); 751 752 for evt in self.irq_events.lock().iter().flatten() { 753 if (evt.gsi as usize) >= self.ioapic_pins { 754 continue; 755 } 756 if let Some(resample_evt) = &evt.resample_event { 757 ioapic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?); 758 pic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?); 759 } 760 } 761 762 // Register resample events with the ioapic 763 self.ioapic 764 .lock() 765 .register_resample_events(ioapic_resample_events); 766 // Register resample events with the pic 767 self.pic 768 .lock() 769 .register_resample_events(pic_resample_events); 770 771 // Make sure all future irq numbers are >= self.ioapic_pins 772 let mut irq_num = resources.allocate_irq().unwrap(); 773 while irq_num < self.ioapic_pins as u32 { 774 irq_num = resources.allocate_irq().unwrap(); 775 } 776 777 // Spawn timer threads here instead of in new(), in case crosvm is in sandbox mode. 
778 self.activated = true; 779 let _ = self.wake(); 780 781 Ok(()) 782 } 783 784 /// The UserspaceIrqChip's ioapic may be locked because a vcpu thread is currently writing to 785 /// the ioapic, and the ioapic may be blocking on adding MSI routes, which requires blocking 786 /// tube communication back to the main thread. Thus, we do not want the main thread to 787 /// block on a locked ioapic, so any irqs that could not be serviced because the ioapic could 788 /// not be immediately locked are added to the delayed_ioapic_irq_events Vec. This function 789 /// processes each delayed event in the vec each time it's called. If the ioapic is still 790 /// locked, we keep the queued irqs for the next time this function is called. process_delayed_irq_events(&mut self) -> Result<()>791 fn process_delayed_irq_events(&mut self) -> Result<()> { 792 let irq_events = self.irq_events.lock(); 793 let mut delayed_events = self.delayed_ioapic_irq_events.lock(); 794 delayed_events.events.retain(|&event_index| { 795 if let Some(evt) = &irq_events[event_index] { 796 if let Ok(mut ioapic) = self.ioapic.try_lock() { 797 if evt.resample_event.is_some() { 798 ioapic.service_irq(evt.gsi as usize, true); 799 } else { 800 ioapic.service_irq(evt.gsi as usize, true); 801 ioapic.service_irq(evt.gsi as usize, false); 802 } 803 804 false 805 } else { 806 true 807 } 808 } else { 809 true 810 } 811 }); 812 813 if delayed_events.events.is_empty() { 814 delayed_events.trigger.wait()?; 815 } 816 Ok(()) 817 } 818 irq_delayed_event_token(&self) -> Result<Option<Event>>819 fn irq_delayed_event_token(&self) -> Result<Option<Event>> { 820 Ok(Some( 821 self.delayed_ioapic_irq_events.lock().trigger.try_clone()?, 822 )) 823 } 824 check_capability(&self, c: IrqChipCap) -> bool825 fn check_capability(&self, c: IrqChipCap) -> bool { 826 match c { 827 IrqChipCap::TscDeadlineTimer => false, 828 IrqChipCap::X2Apic => false, 829 IrqChipCap::MpStateGetSet => true, 830 } 831 } 832 } 833 834 impl<V: VcpuX86_64 + 
'static> BusDevice for UserspaceIrqChip<V> { debug_label(&self) -> String835 fn debug_label(&self) -> String { 836 "UserspaceIrqChip APIC".to_string() 837 } device_id(&self) -> DeviceId838 fn device_id(&self) -> DeviceId { 839 CrosvmDeviceId::UserspaceIrqChip.into() 840 } 841 } 842 843 impl<V: VcpuX86_64 + 'static> Suspendable for UserspaceIrqChip<V> { sleep(&mut self) -> anyhow::Result<()>844 fn sleep(&mut self) -> anyhow::Result<()> { 845 let mut dropper = self.dropper.lock(); 846 dropper.sleep() 847 } 848 wake(&mut self) -> anyhow::Result<()>849 fn wake(&mut self) -> anyhow::Result<()> { 850 if self.activated { 851 // create workers and run them. 852 let mut dropper = self.dropper.lock(); 853 for (i, descriptor) in self.timer_descriptors.iter().enumerate() { 854 let mut worker = TimerWorker { 855 id: i, 856 apic: self.apics[i].clone(), 857 descriptor: *descriptor, 858 vcpus: self.vcpus.clone(), 859 waiter: self.waiters[i].clone(), 860 }; 861 let worker_thread = WorkerThread::start( 862 format!("UserspaceIrqChip timer worker {}", i), 863 move |evt| worker.run(evt), 864 ); 865 dropper.workers.push(worker_thread); 866 } 867 } 868 Ok(()) 869 } 870 } 871 872 impl<V: VcpuX86_64 + 'static> BusDeviceSync for UserspaceIrqChip<V> { read(&self, info: BusAccessInfo, data: &mut [u8])873 fn read(&self, info: BusAccessInfo, data: &mut [u8]) { 874 self.apics[info.id].lock().read(info.offset, data) 875 } write(&self, info: BusAccessInfo, data: &[u8])876 fn write(&self, info: BusAccessInfo, data: &[u8]) { 877 let msg = self.apics[info.id].lock().write(info.offset, data); 878 if let Some(m) = msg { 879 self.handle_msg(m); 880 } 881 } 882 } 883 884 impl<V: VcpuX86_64 + 'static> IrqChipX86_64 for UserspaceIrqChip<V> { try_box_clone(&self) -> Result<Box<dyn IrqChipX86_64>>885 fn try_box_clone(&self) -> Result<Box<dyn IrqChipX86_64>> { 886 Ok(Box::new(self.try_clone()?)) 887 } 888 as_irq_chip(&self) -> &dyn IrqChip889 fn as_irq_chip(&self) -> &dyn IrqChip { 890 self 891 } 892 
as_irq_chip_mut(&mut self) -> &mut dyn IrqChip893 fn as_irq_chip_mut(&mut self) -> &mut dyn IrqChip { 894 self 895 } 896 get_pic_state(&self, select: PicSelect) -> Result<PicState>897 fn get_pic_state(&self, select: PicSelect) -> Result<PicState> { 898 Ok(self.pic.lock().get_pic_state(select)) 899 } 900 set_pic_state(&mut self, select: PicSelect, state: &PicState) -> Result<()>901 fn set_pic_state(&mut self, select: PicSelect, state: &PicState) -> Result<()> { 902 self.pic.lock().set_pic_state(select, state); 903 Ok(()) 904 } 905 get_ioapic_state(&self) -> Result<IoapicState>906 fn get_ioapic_state(&self) -> Result<IoapicState> { 907 Ok(self.ioapic.lock().get_ioapic_state()) 908 } 909 set_ioapic_state(&mut self, state: &IoapicState) -> Result<()>910 fn set_ioapic_state(&mut self, state: &IoapicState) -> Result<()> { 911 self.ioapic.lock().set_ioapic_state(state); 912 Ok(()) 913 } 914 get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState>915 fn get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState> { 916 Ok(self.apics[vcpu_id].lock().get_state()) 917 } 918 set_lapic_state(&mut self, vcpu_id: usize, state: &LapicState) -> Result<()>919 fn set_lapic_state(&mut self, vcpu_id: usize, state: &LapicState) -> Result<()> { 920 self.apics[vcpu_id].lock().set_state(state); 921 Ok(()) 922 } 923 924 /// Get the lapic frequency in Hz lapic_frequency(&self) -> u32925 fn lapic_frequency(&self) -> u32 { 926 Apic::frequency() 927 } 928 get_pit(&self) -> Result<PitState>929 fn get_pit(&self) -> Result<PitState> { 930 Ok(self.pit.lock().get_pit_state()) 931 } 932 set_pit(&mut self, state: &PitState) -> Result<()>933 fn set_pit(&mut self, state: &PitState) -> Result<()> { 934 self.pit.lock().set_pit_state(state); 935 Ok(()) 936 } 937 938 /// Returns true if the PIT uses port 0x61 for the PC speaker, false if 0x61 is unused. 939 /// devices::Pit uses 0x61. 
    fn pit_uses_speaker_port(&self) -> bool {
        true
    }

    /// Snapshotting chip-specific state is not implemented for the userspace irqchip.
    fn snapshot_chip_specific(&self) -> anyhow::Result<AnySnapshot> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }

    /// Restoring chip-specific state is not implemented for the userspace irqchip.
    fn restore_chip_specific(&mut self, _data: AnySnapshot) -> anyhow::Result<()> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }
}

/// Condition variable used by `UserspaceIrqChip::wait_until_runnable`.
#[derive(Default)]
struct Waiter {
    // mtx stores an "interrupted" bool that's true if `kick_halted_vcpus` has been called.
    mtx: Mutex<bool>,
    cvar: Condvar,
}

impl Waiter {
    /// Wakes up `wait_until_runnable` to recheck the interrupted flag and vcpu runnable state.
    pub fn notify(&self) {
        // Taking the mutex before notifying ensures the waiter can't miss the wakeup between
        // checking its condition and sleeping on the condvar.
        let _lock = self.mtx.lock();
        self.cvar.notify_all();
    }

    /// Sets the interrupted flag, and wakes up `wait_until_runnable` to recheck the interrupted
    /// flag and vcpu runnable state. If `interrupted` is true, then `wait_until_runnable` should
    /// stop waiting for a runnable vcpu and return immediately.
    pub fn set_and_notify(&self, interrupted: bool) {
        let mut interrupted_lock = self.mtx.lock();
        *interrupted_lock = interrupted;
        self.cvar.notify_all();
    }
}

/// Worker thread for polling timer events and sending them to an APIC.
struct TimerWorker<V: VcpuX86_64> {
    // Index of the vcpu/APIC this worker serves.
    id: usize,
    // The APIC whose timer expirations this worker delivers.
    apic: Arc<Mutex<Apic>>,
    // Shared vcpu list, used to request an interrupt window on expiration.
    vcpus: Arc<Mutex<Vec<Option<V>>>>,
    // Raw descriptor of the APIC's timer, polled for readability.
    descriptor: Descriptor,
    // Waiter notified after each expiration so halted vcpus recheck runnability.
    waiter: Arc<Waiter>,
}

impl<V: VcpuX86_64> TimerWorker<V> {
    /// Polls the timer descriptor and the kill event until asked to exit, forwarding each
    /// timer expiration to the APIC and waking the associated vcpu.
    fn run(&mut self, kill_evt: Event) -> TimerWorkerResult<()> {
        #[derive(EventToken)]
        enum Token {
            // The timer expired.
            TimerExpire,
            // The parent thread requested an exit.
            Kill,
        }

        let wait_ctx: WaitContext<Token> = WaitContext::build_with(&[
            (&self.descriptor, Token::TimerExpire),
            (&kill_evt, Token::Kill),
        ])
        .map_err(TimerWorkerError::CreateWaitContext)?;

        // Event loop: runs until the kill event fires.
        loop {
            let events = wait_ctx.wait().map_err(TimerWorkerError::WaitError)?;
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::TimerExpire => {
                        // Inject the timer interrupt into the APIC, then ask the vcpu to open
                        // an interrupt window so it gets delivered promptly.
                        self.apic.lock().handle_timer_expiration();
                        if let Some(Some(vcpu)) = self.vcpus.lock().get(self.id) {
                            vcpu.set_interrupt_window_requested(true);
                        }
                        // Wake the vcpu if it is halted in wait_until_runnable.
                        self.waiter.notify();
                    }
                    Token::Kill => return Ok(()),
                }
            }
        }
    }
}

/// Errors that can terminate a `TimerWorker`.
#[derive(Debug)]
enum TimerWorkerError {
    /// Creating WaitContext failed.
    CreateWaitContext(Error),
    /// Error while waiting for events.
    WaitError(Error),
}

impl Display for TimerWorkerError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::TimerWorkerError::*;

        match self {
            CreateWaitContext(e) => write!(f, "failed to create event context: {}", e),
            WaitError(e) => write!(f, "failed to wait for events: {}", e),
        }
    }
}

impl std::error::Error for TimerWorkerError {}

type TimerWorkerResult<T> = std::result::Result<T, TimerWorkerError>;