// Copyright 2020 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::convert::TryFrom;
use std::convert::TryInto;
use std::fmt;
use std::fmt::Display;
use std::iter;
use std::sync::Arc;

cfg_if::cfg_if! {
    if #[cfg(test)] {
        use base::{FakeClock as Clock, FakeTimer as Timer};
    } else {
        use base::{Clock, Timer};
    }
}
use anyhow::Context;
use base::error;
use base::info;
use base::warn;
use base::AsRawDescriptor;
use base::Descriptor;
use base::Error;
use base::Event;
use base::EventToken;
use base::Result;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::DeliveryMode;
use hypervisor::IoapicState;
use hypervisor::IrqRoute;
use hypervisor::IrqSource;
use hypervisor::IrqSourceChip;
use hypervisor::LapicState;
use hypervisor::MPState;
use hypervisor::MsiAddressMessage;
use hypervisor::MsiDataMessage;
use hypervisor::PicSelect;
use hypervisor::PicState;
use hypervisor::PitState;
use hypervisor::Vcpu;
use hypervisor::VcpuX86_64;
use resources::SystemAllocator;
use snapshot::AnySnapshot;
use sync::Condvar;
use sync::Mutex;

use crate::bus::BusDeviceSync;
use crate::irqchip::Apic;
use crate::irqchip::ApicBusMsg;
use crate::irqchip::DelayedIoApicIrqEvents;
use crate::irqchip::Interrupt;
use crate::irqchip::InterruptData;
use crate::irqchip::InterruptDestination;
use crate::irqchip::Ioapic;
use crate::irqchip::IrqEvent;
use crate::irqchip::IrqEventIndex;
use crate::irqchip::Pic;
use crate::irqchip::Routes;
use crate::irqchip::VcpuRunState;
use crate::irqchip::APIC_BASE_ADDRESS;
use crate::irqchip::APIC_MEM_LENGTH_BYTES;
use crate::irqchip::IOAPIC_BASE_ADDRESS;
use crate::irqchip::IOAPIC_MEM_LENGTH_BYTES;
use crate::pci::CrosvmDeviceId;
use crate::Bus;
use crate::BusAccessInfo;
use crate::BusDevice;
use crate::DeviceId;
use crate::IrqChip;
use crate::IrqChipCap;
use crate::IrqChipX86_64;
use crate::IrqEdgeEvent;
use crate::IrqEventSource;
use crate::IrqLevelEvent;
use crate::Pit;
use crate::PitError;
use crate::Suspendable;

/// PIT channel 0 timer is connected to IRQ 0
const PIT_CHANNEL0_IRQ: u32 = 0;
/// CR0 extension type bit
const X86_CR0_ET: u64 = 0x00000010;
/// CR0 not write through bit
const X86_CR0_NW: u64 = 0x20000000;
/// CR0 cache disable bit
const X86_CR0_CD: u64 = 0x40000000;
/// Default power on state of CR0 register, according to the Intel manual.
const X86_CR0_INIT: u64 = X86_CR0_ET | X86_CR0_NW | X86_CR0_CD;

/// An `IrqChip` with all interrupt devices emulated in userspace.  `UserspaceIrqChip` works with
/// any hypervisor, but only supports x86.
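///
/// A minimal construction sketch (not taken from this file): `SomeVcpu` stands in for any
/// `VcpuX86_64` implementation, and the irq tube is assumed to come from `Tube::pair()`.
///
/// ```ignore
/// let (irq_tube, _other_end) = base::Tube::pair().unwrap();
/// // Build a chip for 8 vcpus with the default number of IOAPIC pins.
/// let chip = UserspaceIrqChip::<SomeVcpu>::new(8, irq_tube, None).unwrap();
/// ```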
pub struct UserspaceIrqChip<V: VcpuX86_64> {
    pub vcpus: Arc<Mutex<Vec<Option<V>>>>,
    routes: Arc<Mutex<Routes>>,
    pit: Arc<Mutex<Pit>>,
    pic: Arc<Mutex<Pic>>,
    ioapic: Arc<Mutex<Ioapic>>,
    ioapic_pins: usize,
    pub apics: Vec<Arc<Mutex<Apic>>>,
    // Condition variables used by wait_until_runnable.
    waiters: Vec<Arc<Waiter>>,
    // Raw descriptors of the apic Timers.
    timer_descriptors: Vec<Descriptor>,
    /// Delayed ioapic irq object that contains events delayed because the ioapic was locked
    /// when service_irq was called on the irqchip. This prevents deadlocks when a Vcpu thread has
    /// locked the ioapic and the ioapic sends an AddMsiRoute signal to the main thread (which
    /// itself may be busy trying to call service_irq).
    ///
    /// ## Note:
    /// This lock may be locked by itself to access the `DelayedIoApicIrqEvents`. If accessed in
    /// conjunction with the `irq_events` field, that lock should be taken first to prevent
    /// deadlocks stemming from lock-ordering issues.
    delayed_ioapic_irq_events: Arc<Mutex<DelayedIoApicIrqEvents>>,
    // Array of Events that devices will use to assert ioapic pins.
    irq_events: Arc<Mutex<Vec<Option<IrqEvent>>>>,
    dropper: Arc<Mutex<Dropper>>,
    activated: bool,
}

/// Helper that implements `Drop` on behalf of `UserspaceIrqChip`.  The many cloned copies of an irq
/// chip share a single arc'ed `Dropper`, which only runs its drop when the last irq chip copy is
/// dropped.
struct Dropper {
    /// Worker threads that deliver timer events to the APICs.
    workers: Vec<WorkerThread<TimerWorkerResult<()>>>,
}

impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
    /// Constructs a new `UserspaceIrqChip`.
    pub fn new(num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>) -> Result<Self> {
        let clock = Arc::new(Mutex::new(Clock::new()));
        Self::new_with_clock(num_vcpus, irq_tube, ioapic_pins, clock)
    }

    /// Constructs a new `UserspaceIrqChip`, with a clock.  Used for testing.
    pub fn new_with_clock(
        num_vcpus: usize,
        irq_tube: Tube,
        ioapic_pins: Option<usize>,
        clock: Arc<Mutex<Clock>>,
    ) -> Result<Self> {
        let pit_evt = IrqEdgeEvent::new()?;
        // For tests only: this clock instance is a FakeClock. It needs to be cloned for every
        // Timer instance, so clone it now.
        #[cfg(test)]
        let test_clock = clock.clone();
        let pit = Pit::new(pit_evt.try_clone()?, clock).map_err(|e| match e {
            PitError::CloneEvent(err) => err,
            PitError::CreateEvent(err) => err,
            PitError::CreateWaitContext(err) => err,
            PitError::TimerCreateError(err) => err,
            PitError::WaitError(err) => err,
            PitError::SpawnThread(_) => Error::new(libc::EIO),
        })?;
        let pit_event_source = IrqEventSource::from_device(&pit);

        let ioapic_pins = ioapic_pins.unwrap_or(hypervisor::NUM_IOAPIC_PINS);
        let ioapic = Ioapic::new(irq_tube, ioapic_pins)?;

        let mut timer_descriptors: Vec<Descriptor> = Vec::with_capacity(num_vcpus);
        let mut apics: Vec<Arc<Mutex<Apic>>> = Vec::with_capacity(num_vcpus);
        for id in 0..num_vcpus {
            cfg_if::cfg_if! {
                if #[cfg(test)] {
                    let timer = Timer::new(test_clock.clone());
                } else {
                    let timer = Timer::new()?;
                }
            }
            // Timers are owned by the apics, which outlive the raw descriptors stored here and in
            // the worker threads.
            timer_descriptors.push(Descriptor(timer.as_raw_descriptor()));

            let id: u8 = id.try_into().or(Err(Error::new(libc::EINVAL)))?;
            let apic = Apic::new(id, Box::new(timer));
            apics.push(Arc::new(Mutex::new(apic)));
        }
        let dropper = Dropper {
            workers: Vec::new(),
        };

        let mut chip = UserspaceIrqChip {
            vcpus: Arc::new(Mutex::new(
                iter::repeat_with(|| None).take(num_vcpus).collect(),
            )),
            waiters: iter::repeat_with(Default::default)
                .take(num_vcpus)
                .collect(),
            routes: Arc::new(Mutex::new(Routes::new())),
            pit: Arc::new(Mutex::new(pit)),
            pic: Arc::new(Mutex::new(Pic::new())),
            ioapic: Arc::new(Mutex::new(ioapic)),
            ioapic_pins,
            apics,
            timer_descriptors,
            delayed_ioapic_irq_events: Arc::new(Mutex::new(DelayedIoApicIrqEvents::new()?)),
            irq_events: Arc::new(Mutex::new(Vec::new())),
            dropper: Arc::new(Mutex::new(dropper)),
            activated: false,
        };

        // Setup standard x86 irq routes
        chip.set_irq_routes(&Routes::default_pic_ioapic_routes(ioapic_pins))?;

        chip.register_edge_irq_event(PIT_CHANNEL0_IRQ, &pit_evt, pit_event_source)?;
        Ok(chip)
    }

    /// Handles a message from an APIC.
    fn handle_msg(&self, msg: ApicBusMsg) {
        match msg {
            ApicBusMsg::Eoi(vector) => {
                let _ = self.broadcast_eoi(vector);
            }
            ApicBusMsg::Ipi(interrupt) => self.send_irq_to_apics(&interrupt),
        }
    }

    /// Sends a Message Signaled Interrupt to one or more APICs.  MSIs consist of a 64-bit address
    /// and 32-bit data, but in the Intel spec we're implementing, only the low 32 bits of the
    /// address are used.
    fn send_msi(&self, addr: u32, data: u32) {
        let mut msi_addr = MsiAddressMessage::new();
        msi_addr.set(0, 32, addr as u64);
        let dest = match InterruptDestination::try_from(&msi_addr) {
            Ok(dest) => dest,
            Err(e) => {
                warn!("Invalid MSI message: {}", e);
                return;
            }
        };

        let mut msi_data = MsiDataMessage::new();
        msi_data.set(0, 32, data as u64);
        let data = InterruptData::from(&msi_data);

        self.send_irq_to_apics(&Interrupt { dest, data });
    }

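    /// Sends an interrupt to the APIC with local APIC ID `id` and requests an interrupt window on
    /// the corresponding vcpu so the interrupt can be injected.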
    pub fn send_irq_to_apic(&self, id: usize, irq: &InterruptData) {
        // id can come from the guest, so check bounds.
        if let Some(apic) = self.apics.get(id) {
            apic.lock().accept_irq(irq);
        } else {
            error!("Interrupt for non-existent apic {}: {:?}", id, irq);
        }
        if let Some(Some(vcpu)) = self.vcpus.lock().get(id) {
            vcpu.set_interrupt_window_requested(true);
        } else {
            error!("Interrupt for non-existent vcpu {}: {:?}", id, irq);
        }
        self.waiters[id].notify();
    }

    /// Sends an interrupt to one or more APICs.  Used for sending MSIs and IPIs.
    pub fn send_irq_to_apics(&self, irq: &Interrupt) {
        match irq.data.delivery {
            DeliveryMode::Fixed | DeliveryMode::Lowest | DeliveryMode::RemoteRead => {}
            _ => info!("UserspaceIrqChip received special irq: {:?}", irq),
        }

        // First try the fast path, where the destination is a single APIC we can send to directly.
        if let Some(apic_id) = Apic::single_dest_fast(&irq.dest) {
            self.send_irq_to_apic(apic_id as usize, &irq.data);
            return;
        }

        let lowest_mode = irq.data.delivery == DeliveryMode::Lowest;
        let mut lowest_priority = u8::MAX;
        let mut lowest_apic: Option<usize> = None;

        for (i, apic) in self.apics.iter().enumerate() {
            let send = {
                let apic = apic.lock();
                if !apic.match_dest(&irq.dest) {
                    false
                } else if lowest_mode {
                    let priority = apic.get_processor_priority();
                    if priority <= lowest_priority {
                        lowest_priority = priority;
                        lowest_apic = Some(i);
                    }
                    false
                } else {
                    true
                }
            };
            if send {
                self.send_irq_to_apic(i, &irq.data);
            }
        }

        if lowest_mode {
            if let Some(index) = lowest_apic {
                self.send_irq_to_apic(index, &irq.data);
            } else {
                // According to sections 10.6.2.1 and 10.6.2.2 of the SDM, the OS should not let
                // this happen.  If the OS is misconfigured then drop the interrupt and log a
                // warning.
                warn!(
                    "Lowest priority interrupt sent, but no apics configured as valid target: {:?}",
                    irq
                );
            }
        }
    }

    /// Delivers a startup IPI to `vcpu`.
    fn deliver_startup(&self, vcpu: &V, vector: u8) -> Result<()> {
        // This comes from Intel SDM volume 3, chapter 8.4.  The vector specifies a page aligned
        // address where execution should start.  cs.base is the offset for the code segment with an
        // RIP of 0.  The cs.selector is just the base shifted right by 4 bits.
        let mut sregs = vcpu.get_sregs()?;
        sregs.cs.base = (vector as u64) << 12;
        sregs.cs.selector = (vector as u16) << 8;

        // Set CR0 to its INIT value per the manual.  Application processors won't boot with the CR0
        // protected mode and paging bits set by setup_sregs().  Kernel APIC doesn't have this
        // issue, probably because it uses MSRs instead of MMIO, so it's less affected when the AP's
        // state (CR3 etc.) doesn't reflect changes that Linux made while booting vcpu 0.
        sregs.cr0 = X86_CR0_INIT;
        vcpu.set_sregs(&sregs)?;

        let mut regs = vcpu.get_regs()?;
        regs.rip = 0;
        vcpu.set_regs(&regs)?;

        Ok(())
    }

    /// Checks if the specified VCPU is in a runnable state.
    fn is_runnable(&self, vcpu_id: usize) -> bool {
        self.apics[vcpu_id].lock().get_mp_state() == MPState::Runnable
    }
}

impl Dropper {
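    /// Stops all timer worker threads and waits for them to exit, propagating any error they
    /// returned.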
    fn sleep(&mut self) -> anyhow::Result<()> {
        for thread in self.workers.split_off(0).into_iter() {
            thread
                .stop()
                .context("UserspaceIrqChip worker thread exited with error")?;
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
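    /// Registers `irq_event` (and an optional resample event) as the Event that asserts gsi `irq`,
    /// and returns the index of the new entry in `irq_events`.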
    fn register_irq_event(
        &mut self,
        irq: u32,
        irq_event: &Event,
        resample_event: Option<&Event>,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        let mut evt = IrqEvent {
            gsi: irq,
            event: irq_event.try_clone()?,
            resample_event: None,
            source,
        };
        if let Some(resample_event) = resample_event {
            evt.resample_event = Some(resample_event.try_clone()?);
        }

        let mut irq_events = self.irq_events.lock();
        let index = irq_events.len();
        irq_events.push(Some(evt));
        Ok(Some(index))
    }

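    /// Removes the entry in `irq_events` that matches gsi `irq` and `irq_event`, if one exists.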
    fn unregister_irq_event(&mut self, irq: u32, irq_event: &Event) -> Result<()> {
        let mut irq_events = self.irq_events.lock();
        for (index, evt) in irq_events.iter().enumerate() {
            if let Some(evt) = evt {
                if evt.gsi == irq && irq_event.eq(&evt.event) {
                    irq_events[index] = None;
                    break;
                }
            }
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> IrqChip for UserspaceIrqChip<V> {
    fn add_vcpu(&mut self, vcpu_id: usize, vcpu: &dyn Vcpu) -> Result<()> {
        let vcpu: &V = vcpu
            .downcast_ref()
            .expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type");
        self.vcpus.lock()[vcpu_id] = Some(vcpu.try_clone()?);
        Ok(())
    }

    fn register_edge_irq_event(
        &mut self,
        irq: u32,
        irq_event: &IrqEdgeEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(irq, irq_event.get_trigger(), None, source)
    }

    fn unregister_edge_irq_event(&mut self, irq: u32, irq_event: &IrqEdgeEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    fn register_level_irq_event(
        &mut self,
        irq: u32,
        irq_event: &IrqLevelEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(
            irq,
            irq_event.get_trigger(),
            Some(irq_event.get_resample()),
            source,
        )
    }

    fn unregister_level_irq_event(&mut self, irq: u32, irq_event: &IrqLevelEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    fn route_irq(&mut self, route: IrqRoute) -> Result<()> {
        self.routes.lock().add(route)
    }

    fn set_irq_routes(&mut self, routes: &[IrqRoute]) -> Result<()> {
        self.routes.lock().replace_all(routes)
    }

    fn irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>> {
        let mut tokens: Vec<(IrqEventIndex, IrqEventSource, Event)> = Vec::new();
        for (index, evt) in self.irq_events.lock().iter().enumerate() {
            if let Some(evt) = evt {
                tokens.push((index, evt.source.clone(), evt.event.try_clone()?));
            }
        }
        Ok(tokens)
    }

    fn service_irq(&mut self, irq: u32, level: bool) -> Result<()> {
        for route in self.routes.lock()[irq as usize].iter() {
            match *route {
                IrqSource::Irqchip {
                    chip: IrqSourceChip::PicPrimary,
                    pin,
                }
                | IrqSource::Irqchip {
                    chip: IrqSourceChip::PicSecondary,
                    pin,
                } => {
                    self.pic.lock().service_irq(pin as u8, level);
                }
                IrqSource::Irqchip {
                    chip: IrqSourceChip::Ioapic,
                    pin,
                } => {
                    self.ioapic.lock().service_irq(pin as usize, level);
                }
                // service_irq's level parameter is ignored for MSIs.  MSI data specifies the level.
                IrqSource::Msi { address, data } => self.send_msi(address as u32, data),
                _ => {
                    error!("Unexpected route source {:?}", route);
                    return Err(Error::new(libc::EINVAL));
                }
            }
        }
        Ok(())
    }

    /// Services an IRQ event by asserting then deasserting an IRQ line.  The associated Event
    /// that triggered the irq event will be read from.  If the irq is associated with a resample
    /// Event, then the deassert will only happen after an EOI is broadcast for a vector
    /// associated with the irq line.
    /// For UserspaceIrqChip, this function identifies the destination(s) of the irq: PIC, IOAPIC,
    /// or APIC (MSI).  If it's a PIC or IOAPIC route, we attempt to call service_irq on those
    /// chips.  If the IOAPIC is unable to be immediately locked, we add the irq to the
    /// delayed_ioapic_irq_events (though we still read from the Event that triggered the irq
    /// event).  If it's an MSI route, we call send_msi to decode the MSI and send it to the
    /// destination APIC(s).
    fn service_irq_event(&mut self, event_index: IrqEventIndex) -> Result<()> {
        let irq_events = self.irq_events.lock();
        let evt = if let Some(evt) = &irq_events[event_index] {
            evt
        } else {
            return Ok(());
        };
        evt.event.wait()?;

        for route in self.routes.lock()[evt.gsi as usize].iter() {
            match *route {
                IrqSource::Irqchip {
                    chip: IrqSourceChip::PicPrimary,
                    pin,
                }
                | IrqSource::Irqchip {
                    chip: IrqSourceChip::PicSecondary,
                    pin,
                } => {
                    let mut pic = self.pic.lock();
                    if evt.resample_event.is_some() {
                        pic.service_irq(pin as u8, true);
                    } else {
                        pic.service_irq(pin as u8, true);
                        pic.service_irq(pin as u8, false);
                    }
                }
                IrqSource::Irqchip {
                    chip: IrqSourceChip::Ioapic,
                    pin,
                } => {
                    if let Ok(mut ioapic) = self.ioapic.try_lock() {
                        if evt.resample_event.is_some() {
                            ioapic.service_irq(pin as usize, true);
                        } else {
                            ioapic.service_irq(pin as usize, true);
                            ioapic.service_irq(pin as usize, false);
                        }
                    } else {
                        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
                        delayed_events.events.push(event_index);
                        delayed_events.trigger.signal().unwrap();
                    }
                }
                IrqSource::Msi { address, data } => self.send_msi(address as u32, data),
                _ => {
                    error!("Unexpected route source {:?}", route);
                    return Err(Error::new(libc::EINVAL));
                }
            }
        }

        Ok(())
    }

    /// Broadcasts an end of interrupt.  For UserspaceIrqChip this sends the EOI to the ioapic.
    fn broadcast_eoi(&self, vector: u8) -> Result<()> {
        self.ioapic.lock().end_of_interrupt(vector);
        Ok(())
    }

    /// Injects any pending interrupts for `vcpu`.
    ///
    /// For UserspaceIrqChip this:
    ///   * Injects a PIC interrupt, if vcpu_id is 0 and vcpu is ready for interrupt
    ///   * Injects an APIC fixed interrupt, if vcpu is ready for interrupt and PIC didn't inject
    ///   * Injects APIC NMIs
    ///   * Handles APIC INIT IPIs
    ///   * Handles APIC SIPIs
    ///   * Requests an interrupt window, if PIC or APIC still has pending interrupts for this vcpu
    fn inject_interrupts(&self, vcpu: &dyn Vcpu) -> Result<()> {
        let vcpu: &V = vcpu
            .downcast_ref()
            .expect("UserspaceIrqChip::inject_interrupts called with incorrect vcpu type");
        let vcpu_id = vcpu.id();
        let mut vcpu_ready = vcpu.ready_for_interrupt();

        let mut pic_needs_window = false;
        if vcpu_id == 0 {
            let mut pic = self.pic.lock();
            if vcpu_ready {
                if let Some(vector) = pic.get_external_interrupt() {
                    vcpu.interrupt(vector)?;
                    self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
                    // Already injected a PIC interrupt, so APIC fixed interrupt can't be injected.
                    vcpu_ready = false;
                }
            }
            pic_needs_window = pic.interrupt_requested();
        }

        let irqs = self.apics[vcpu_id].lock().get_pending_irqs(vcpu_ready);
        if let Some(vector) = irqs.fixed {
            let do_interrupt = {
                let mut apic = self.apics[vcpu_id].lock();
                match apic.get_mp_state() {
                    MPState::Runnable | MPState::Halted => {
                        // APIC interrupts should only be injectable when the MPState is
                        // Halted or Runnable.
                        apic.set_mp_state(&MPState::Runnable);
                        true
                    }
                    s => {
                        // This shouldn't happen, but log a helpful error if it does.
                        error!("Interrupt cannot be injected while in state: {:?}", s);
                        false
                    }
                }
            };

            if do_interrupt {
                vcpu.interrupt(vector)?;
            }
        }
        for _ in 0..irqs.nmis {
            let prev_state = self.apics[vcpu_id].lock().get_mp_state();
            vcpu.inject_nmi()?;
            self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
            info!(
                "Delivered NMI to cpu {}, mp_state was {:?}, now is {:?}",
                vcpu_id,
                prev_state,
                MPState::Runnable
            );
        }
        if irqs.init {
            {
                let mut apic = self.apics[vcpu_id].lock();
                apic.load_reset_state();
                apic.set_mp_state(&MPState::InitReceived);
            }
            info!("Delivered INIT IPI to cpu {}", vcpu_id);
        }
        if let Some(vector) = irqs.startup {
            // If our state is not MPState::InitReceived then this is probably
            // the second SIPI in the INIT-SIPI-SIPI sequence; ignore.
            if self.apics[vcpu_id].lock().get_mp_state() == MPState::InitReceived {
                self.deliver_startup(vcpu, vector)?;
                self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
                info!("Delivered SIPI to cpu {}", vcpu_id);
            }
        }

        let needs_window = pic_needs_window || irqs.needs_window;
        vcpu.set_interrupt_window_requested(needs_window);

        Ok(())
    }

    /// Notifies the irq chip that the specified VCPU has executed a halt instruction.
    /// For `UserspaceIrqChip`, it sets the APIC's mp_state to `MPState::Halted`.
    fn halted(&self, vcpu_id: usize) {
        self.apics[vcpu_id].lock().set_mp_state(&MPState::Halted)
    }

    /// Blocks until `vcpu` is in a runnable state or until interrupted by
    /// `IrqChip::kick_halted_vcpus`.  Returns `VcpuRunState::Runnable` if vcpu is runnable, or
    /// `VcpuRunState::Interrupted` if the wait was interrupted.
    /// For `UserspaceIrqChip`, if the APIC isn't `MPState::Runnable`, sleep until there are new
    /// interrupts pending on the APIC, inject the interrupts, and go back to sleep if still not
    /// runnable.
    fn wait_until_runnable(&self, vcpu: &dyn Vcpu) -> Result<VcpuRunState> {
        let vcpu_id = vcpu.id();
        let waiter = &self.waiters[vcpu_id];
        let mut interrupted_lock = waiter.mtx.lock();
        loop {
            if *interrupted_lock {
                *interrupted_lock = false;
                info!("wait_until_runnable interrupted on cpu {}", vcpu_id);
                return Ok(VcpuRunState::Interrupted);
            }
            if self.is_runnable(vcpu_id) {
                return Ok(VcpuRunState::Runnable);
            }

            self.inject_interrupts(vcpu)?;
            if self.is_runnable(vcpu_id) {
                return Ok(VcpuRunState::Runnable);
            }
            interrupted_lock = waiter.cvar.wait(interrupted_lock);
        }
    }

    /// Makes unrunnable VCPUs return immediately from `wait_until_runnable`.
    /// For UserspaceIrqChip, every vcpu gets kicked so its current or next call to
    /// `wait_until_runnable` will immediately return `VcpuRunState::Interrupted`.  After that one
    /// kick, subsequent `wait_until_runnable` calls go back to waiting for runnability normally.
    fn kick_halted_vcpus(&self) {
        for waiter in self.waiters.iter() {
            waiter.set_and_notify(/* interrupted= */ true);
        }
    }

    fn get_mp_state(&self, vcpu_id: usize) -> Result<MPState> {
        Ok(self.apics[vcpu_id].lock().get_mp_state())
    }

    fn set_mp_state(&mut self, vcpu_id: usize, state: &MPState) -> Result<()> {
        self.apics[vcpu_id].lock().set_mp_state(state);
        Ok(())
    }

    fn try_clone(&self) -> Result<Self> {
        // kill_evts and timer_descriptors don't change, so they could be a plain Vec with each
        // element cloned.  But the Arc<Mutex> avoids a quadratic number of open descriptors from
        // cloning, and those fields aren't performance critical.
        Ok(UserspaceIrqChip {
            vcpus: self.vcpus.clone(),
            waiters: self.waiters.clone(),
            routes: self.routes.clone(),
            pit: self.pit.clone(),
            pic: self.pic.clone(),
            ioapic: self.ioapic.clone(),
            ioapic_pins: self.ioapic_pins,
            apics: self.apics.clone(),
            timer_descriptors: self.timer_descriptors.clone(),
            delayed_ioapic_irq_events: self.delayed_ioapic_irq_events.clone(),
            irq_events: self.irq_events.clone(),
            dropper: self.dropper.clone(),
            activated: self.activated,
        })
    }

    // TODO(srichman): factor out UserspaceIrqChip and KvmSplitIrqChip::finalize_devices
    fn finalize_devices(
        &mut self,
        resources: &mut SystemAllocator,
        io_bus: &Bus,
        mmio_bus: &Bus,
    ) -> Result<()> {
        // Insert pit into io_bus
        io_bus.insert(self.pit.clone(), 0x040, 0x8).unwrap();
        io_bus.insert(self.pit.clone(), 0x061, 0x1).unwrap();

        // Insert pic into io_bus
        io_bus.insert(self.pic.clone(), 0x20, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0xa0, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0x4d0, 0x2).unwrap();

        // Insert ioapic into mmio_bus
        mmio_bus
            .insert(
                self.ioapic.clone(),
                IOAPIC_BASE_ADDRESS,
                IOAPIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // Insert self into mmio_bus for handling APIC mmio
        mmio_bus
            .insert_sync(
                Arc::new(self.try_clone()?),
                APIC_BASE_ADDRESS,
                APIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // At this point, all of our devices have been created and they have registered their
        // irq events, so we can clone our resample events
        let mut ioapic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();
        let mut pic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();

        for evt in self.irq_events.lock().iter().flatten() {
            if (evt.gsi as usize) >= self.ioapic_pins {
                continue;
            }
            if let Some(resample_evt) = &evt.resample_event {
                ioapic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
                pic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
            }
        }

        // Register resample events with the ioapic
        self.ioapic
            .lock()
            .register_resample_events(ioapic_resample_events);
        // Register resample events with the pic
        self.pic
            .lock()
            .register_resample_events(pic_resample_events);

        // Make sure all future irq numbers are >= self.ioapic_pins
        let mut irq_num = resources.allocate_irq().unwrap();
        while irq_num < self.ioapic_pins as u32 {
            irq_num = resources.allocate_irq().unwrap();
        }

        // Spawn timer threads here instead of in new(), in case crosvm is in sandbox mode.
        self.activated = true;
        let _ = self.wake();

        Ok(())
    }

    /// The UserspaceIrqChip's ioapic may be locked because a vcpu thread is currently writing to
    /// the ioapic, and the ioapic may be blocking on adding MSI routes, which requires blocking
    /// tube communication back to the main thread.  Thus, we do not want the main thread to
    /// block on a locked ioapic, so any irqs that could not be serviced because the ioapic could
    /// not be immediately locked are added to the delayed_ioapic_irq_events Vec. This function
    /// processes each delayed event in the vec each time it's called. If the ioapic is still
    /// locked, we keep the queued irqs for the next time this function is called.
    fn process_delayed_irq_events(&mut self) -> Result<()> {
        let irq_events = self.irq_events.lock();
        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
        delayed_events.events.retain(|&event_index| {
            if let Some(evt) = &irq_events[event_index] {
                if let Ok(mut ioapic) = self.ioapic.try_lock() {
                    if evt.resample_event.is_some() {
                        ioapic.service_irq(evt.gsi as usize, true);
                    } else {
                        ioapic.service_irq(evt.gsi as usize, true);
                        ioapic.service_irq(evt.gsi as usize, false);
                    }

                    false
                } else {
                    true
                }
            } else {
                true
            }
        });

        if delayed_events.events.is_empty() {
            delayed_events.trigger.wait()?;
        }
        Ok(())
    }

    fn irq_delayed_event_token(&self) -> Result<Option<Event>> {
        Ok(Some(
            self.delayed_ioapic_irq_events.lock().trigger.try_clone()?,
        ))
    }

    fn check_capability(&self, c: IrqChipCap) -> bool {
        match c {
            IrqChipCap::TscDeadlineTimer => false,
            IrqChipCap::X2Apic => false,
            IrqChipCap::MpStateGetSet => true,
        }
    }
}

impl<V: VcpuX86_64 + 'static> BusDevice for UserspaceIrqChip<V> {
    fn debug_label(&self) -> String {
        "UserspaceIrqChip APIC".to_string()
    }
    fn device_id(&self) -> DeviceId {
        CrosvmDeviceId::UserspaceIrqChip.into()
    }
}

impl<V: VcpuX86_64 + 'static> Suspendable for UserspaceIrqChip<V> {
    fn sleep(&mut self) -> anyhow::Result<()> {
        let mut dropper = self.dropper.lock();
        dropper.sleep()
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            // create workers and run them.
            let mut dropper = self.dropper.lock();
            for (i, descriptor) in self.timer_descriptors.iter().enumerate() {
                let mut worker = TimerWorker {
                    id: i,
                    apic: self.apics[i].clone(),
                    descriptor: *descriptor,
                    vcpus: self.vcpus.clone(),
                    waiter: self.waiters[i].clone(),
                };
                let worker_thread = WorkerThread::start(
                    format!("UserspaceIrqChip timer worker {}", i),
                    move |evt| worker.run(evt),
                );
                dropper.workers.push(worker_thread);
            }
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> BusDeviceSync for UserspaceIrqChip<V> {
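    // APIC MMIO accesses registered in finalize_devices() land here; `info.id` identifies the vcpu
    // (and thus the APIC) that issued the access.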
    fn read(&self, info: BusAccessInfo, data: &mut [u8]) {
        self.apics[info.id].lock().read(info.offset, data)
    }
    fn write(&self, info: BusAccessInfo, data: &[u8]) {
        let msg = self.apics[info.id].lock().write(info.offset, data);
        if let Some(m) = msg {
            self.handle_msg(m);
        }
    }
}

impl<V: VcpuX86_64 + 'static> IrqChipX86_64 for UserspaceIrqChip<V> {
    fn try_box_clone(&self) -> Result<Box<dyn IrqChipX86_64>> {
        Ok(Box::new(self.try_clone()?))
    }

    fn as_irq_chip(&self) -> &dyn IrqChip {
        self
    }

    fn as_irq_chip_mut(&mut self) -> &mut dyn IrqChip {
        self
    }

    fn get_pic_state(&self, select: PicSelect) -> Result<PicState> {
        Ok(self.pic.lock().get_pic_state(select))
    }

    fn set_pic_state(&mut self, select: PicSelect, state: &PicState) -> Result<()> {
        self.pic.lock().set_pic_state(select, state);
        Ok(())
    }

    fn get_ioapic_state(&self) -> Result<IoapicState> {
        Ok(self.ioapic.lock().get_ioapic_state())
    }

    fn set_ioapic_state(&mut self, state: &IoapicState) -> Result<()> {
        self.ioapic.lock().set_ioapic_state(state);
        Ok(())
    }

    fn get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState> {
        Ok(self.apics[vcpu_id].lock().get_state())
    }

    fn set_lapic_state(&mut self, vcpu_id: usize, state: &LapicState) -> Result<()> {
        self.apics[vcpu_id].lock().set_state(state);
        Ok(())
    }

    /// Get the lapic frequency in Hz
    fn lapic_frequency(&self) -> u32 {
        Apic::frequency()
    }

    fn get_pit(&self) -> Result<PitState> {
        Ok(self.pit.lock().get_pit_state())
    }

    fn set_pit(&mut self, state: &PitState) -> Result<()> {
        self.pit.lock().set_pit_state(state);
        Ok(())
    }

    /// Returns true if the PIT uses port 0x61 for the PC speaker, false if 0x61 is unused.
    /// devices::Pit uses 0x61.
    fn pit_uses_speaker_port(&self) -> bool {
        true
    }

    fn snapshot_chip_specific(&self) -> anyhow::Result<AnySnapshot> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }
    fn restore_chip_specific(&mut self, _data: AnySnapshot) -> anyhow::Result<()> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }
}

/// Condition variable used by `UserspaceIrqChip::wait_until_runnable`.
#[derive(Default)]
struct Waiter {
    // mtx stores an "interrupted" bool that's true if `kick_halted_vcpus` has been called.
    mtx: Mutex<bool>,
    cvar: Condvar,
}

impl Waiter {
    /// Wakes up `wait_until_runnable` to recheck the interrupted flag and vcpu runnable state.
    pub fn notify(&self) {
        let _lock = self.mtx.lock();
        self.cvar.notify_all();
    }

    /// Sets the interrupted flag, and wakes up `wait_until_runnable` to recheck the interrupted
    /// flag and vcpu runnable state.  If `interrupted` is true, then `wait_until_runnable` should
    /// stop waiting for a runnable vcpu and return immediately.
    pub fn set_and_notify(&self, interrupted: bool) {
        let mut interrupted_lock = self.mtx.lock();
        *interrupted_lock = interrupted;
        self.cvar.notify_all();
    }
}

/// Worker thread for polling timer events and sending them to an APIC.
struct TimerWorker<V: VcpuX86_64> {
    id: usize,
    apic: Arc<Mutex<Apic>>,
    vcpus: Arc<Mutex<Vec<Option<V>>>>,
    descriptor: Descriptor,
    waiter: Arc<Waiter>,
}

impl<V: VcpuX86_64> TimerWorker<V> {
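    /// Polls the timer descriptor until the kill event is signaled, forwarding each expiration to
    /// the APIC and requesting an interrupt window on the matching vcpu.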
    fn run(&mut self, kill_evt: Event) -> TimerWorkerResult<()> {
        #[derive(EventToken)]
        enum Token {
            // The timer expired.
            TimerExpire,
            // The parent thread requested an exit.
            Kill,
        }

        let wait_ctx: WaitContext<Token> = WaitContext::build_with(&[
            (&self.descriptor, Token::TimerExpire),
            (&kill_evt, Token::Kill),
        ])
        .map_err(TimerWorkerError::CreateWaitContext)?;

        loop {
            let events = wait_ctx.wait().map_err(TimerWorkerError::WaitError)?;
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::TimerExpire => {
                        self.apic.lock().handle_timer_expiration();
                        if let Some(Some(vcpu)) = self.vcpus.lock().get(self.id) {
                            vcpu.set_interrupt_window_requested(true);
                        }
                        self.waiter.notify();
                    }
                    Token::Kill => return Ok(()),
                }
            }
        }
    }
}

#[derive(Debug)]
enum TimerWorkerError {
    /// Creating WaitContext failed.
    CreateWaitContext(Error),
    /// Error while waiting for events.
    WaitError(Error),
}

impl Display for TimerWorkerError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::TimerWorkerError::*;

        match self {
            CreateWaitContext(e) => write!(f, "failed to create event context: {}", e),
            WaitError(e) => write!(f, "failed to wait for events: {}", e),
        }
    }
}

impl std::error::Error for TimerWorkerError {}

type TimerWorkerResult<T> = std::result::Result<T, TimerWorkerError>;