• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::cmp::max;
6 use std::cmp::Reverse;
7 use std::collections::BTreeMap;
8 use std::collections::BTreeSet;
9 use std::fs;
10 use std::path::Path;
11 use std::path::PathBuf;
12 use std::str::FromStr;
13 use std::sync::Arc;
14 use std::u32;
15 
16 use acpi_tables::aml::Aml;
17 use base::debug;
18 use base::error;
19 use base::pagesize;
20 use base::warn;
21 use base::AsRawDescriptor;
22 use base::AsRawDescriptors;
23 use base::Event;
24 use base::EventToken;
25 use base::MemoryMapping;
26 use base::Protection;
27 use base::RawDescriptor;
28 use base::Tube;
29 use base::WaitContext;
30 use base::WorkerThread;
31 use hypervisor::MemCacheType;
32 use resources::AddressRange;
33 use resources::Alloc;
34 use resources::AllocOptions;
35 use resources::MmioType;
36 use resources::SystemAllocator;
37 use sync::Mutex;
38 use vfio_sys::vfio::VFIO_PCI_ACPI_NTFY_IRQ_INDEX;
39 use vfio_sys::*;
40 use vm_control::api::VmMemoryClient;
41 use vm_control::HotPlugDeviceInfo;
42 use vm_control::HotPlugDeviceType;
43 use vm_control::VmMemoryDestination;
44 use vm_control::VmMemoryRegionId;
45 use vm_control::VmMemorySource;
46 use vm_control::VmRequest;
47 use vm_control::VmResponse;
48 
49 use crate::pci::acpi::DeviceVcfgRegister;
50 use crate::pci::acpi::DsmMethod;
51 use crate::pci::acpi::PowerResourceMethod;
52 use crate::pci::acpi::SHM_OFFSET;
53 use crate::pci::msi::MsiConfig;
54 use crate::pci::msi::MsiStatus;
55 use crate::pci::msi::PCI_MSI_FLAGS;
56 use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
57 use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
58 use crate::pci::msi::PCI_MSI_NEXT_POINTER;
59 use crate::pci::msix::MsixConfig;
60 use crate::pci::msix::MsixStatus;
61 use crate::pci::msix::BITS_PER_PBA_ENTRY;
62 use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
63 use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
64 use crate::pci::pci_device::BarRange;
65 use crate::pci::pci_device::Error as PciDeviceError;
66 use crate::pci::pci_device::PciDevice;
67 use crate::pci::pci_device::PreferredIrq;
68 use crate::pci::pm::PciPmCap;
69 use crate::pci::pm::PmConfig;
70 use crate::pci::pm::PM_CAP_LENGTH;
71 use crate::pci::PciAddress;
72 use crate::pci::PciBarConfiguration;
73 use crate::pci::PciBarIndex;
74 use crate::pci::PciBarPrefetchable;
75 use crate::pci::PciBarRegionType;
76 use crate::pci::PciCapabilityID;
77 use crate::pci::PciClassCode;
78 use crate::pci::PciId;
79 use crate::pci::PciInterruptPin;
80 use crate::pci::PCI_VCFG_DSM;
81 use crate::pci::PCI_VCFG_NOTY;
82 use crate::pci::PCI_VCFG_PM;
83 use crate::pci::PCI_VENDOR_ID_INTEL;
84 use crate::vfio::VfioDevice;
85 use crate::vfio::VfioError;
86 use crate::vfio::VfioIrqType;
87 use crate::vfio::VfioPciConfig;
88 use crate::IrqLevelEvent;
89 use crate::Suspendable;
90 
// Offsets of registers in the standard PCI configuration space header.
const PCI_VENDOR_ID: u32 = 0x0;
const PCI_DEVICE_ID: u32 = 0x2;
const PCI_COMMAND: u32 = 0x4;
// Memory Space Enable bit in the Command register.
const PCI_COMMAND_MEMORY: u8 = 0x2;
const PCI_BASE_CLASS_CODE: u32 = 0x0B;
const PCI_INTERRUPT_NUM: u32 = 0x3C;
const PCI_INTERRUPT_PIN: u32 = 0x3D;

// Offset of the Capabilities Pointer register and the capability IDs handled here.
const PCI_CAPABILITY_LIST: u32 = 0x34;
const PCI_CAP_ID_MSI: u8 = 0x05;
const PCI_CAP_ID_MSIX: u8 = 0x11;
const PCI_CAP_ID_PM: u8 = 0x01;

// Size of the standard PCI config space
const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
// Size of the standard PCIe config space: 4KB
const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;

// PCI Express Extended Capability IDs.
const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;
114 
115 struct VfioPmCap {
116     offset: u32,
117     capabilities: u32,
118     config: PmConfig,
119 }
120 
121 impl VfioPmCap {
new(config: &VfioPciConfig, cap_start: u32) -> Self122     fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
123         let mut capabilities: u32 = config.read_config(cap_start);
124         capabilities |= (PciPmCap::default_cap() as u32) << 16;
125         VfioPmCap {
126             offset: cap_start,
127             capabilities,
128             config: PmConfig::new(false),
129         }
130     }
131 
should_trigger_pme(&mut self) -> bool132     pub fn should_trigger_pme(&mut self) -> bool {
133         self.config.should_trigger_pme()
134     }
135 
is_pm_reg(&self, offset: u32) -> bool136     fn is_pm_reg(&self, offset: u32) -> bool {
137         (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
138     }
139 
read(&self, offset: u32) -> u32140     pub fn read(&self, offset: u32) -> u32 {
141         let offset = offset - self.offset;
142         if offset == 0 {
143             self.capabilities
144         } else {
145             let mut data = 0;
146             self.config.read(&mut data);
147             data
148         }
149     }
150 
write(&mut self, offset: u64, data: &[u8])151     pub fn write(&mut self, offset: u64, data: &[u8]) {
152         let offset = offset - self.offset as u64;
153         if offset >= std::mem::size_of::<u32>() as u64 {
154             let offset = offset - std::mem::size_of::<u32>() as u64;
155             self.config.write(offset, data);
156         }
157     }
158 }
159 
/// Result of a guest write to an MSI/MSI-X capability register: what, if
/// anything, changed about the interrupt configuration.
enum VfioMsiChange {
    // Interrupts were turned off.
    Disable,
    // Interrupts were turned on.
    Enable,
    // Interrupts stayed enabled but their masking state changed.
    FunctionChanged,
}
165 
166 struct VfioMsiCap {
167     config: MsiConfig,
168     offset: u32,
169 }
170 
171 impl VfioMsiCap {
new( config: &VfioPciConfig, msi_cap_start: u32, vm_socket_irq: Tube, device_id: u32, device_name: String, ) -> Self172     fn new(
173         config: &VfioPciConfig,
174         msi_cap_start: u32,
175         vm_socket_irq: Tube,
176         device_id: u32,
177         device_name: String,
178     ) -> Self {
179         let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
180         let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
181         let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;
182 
183         VfioMsiCap {
184             config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
185             offset: msi_cap_start,
186         }
187     }
188 
is_msi_reg(&self, index: u64, len: usize) -> bool189     fn is_msi_reg(&self, index: u64, len: usize) -> bool {
190         self.config.is_msi_reg(self.offset, index, len)
191     }
192 
write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange>193     fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
194         let offset = index as u32 - self.offset;
195         match self.config.write_msi_capability(offset, data) {
196             MsiStatus::Enabled => Some(VfioMsiChange::Enable),
197             MsiStatus::Disabled => Some(VfioMsiChange::Disable),
198             MsiStatus::NothingToDo => None,
199         }
200     }
201 
get_msi_irqfd(&self) -> Option<&Event>202     fn get_msi_irqfd(&self) -> Option<&Event> {
203         self.config.get_irqfd()
204     }
205 
destroy(&mut self)206     fn destroy(&mut self) {
207         self.config.destroy()
208     }
209 }
210 
// MSI-X registers in MSI-X capability
const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
const PCI_MSIX_PBA: u32 = 0x08; // Pending bit Array offset
const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
220 
221 struct VfioMsixCap {
222     config: MsixConfig,
223     offset: u32,
224     table_size: u16,
225     table_pci_bar: PciBarIndex,
226     table_offset: u64,
227     table_size_bytes: u64,
228     pba_pci_bar: PciBarIndex,
229     pba_offset: u64,
230     pba_size_bytes: u64,
231     msix_interrupt_evt: Vec<Event>,
232 }
233 
234 impl VfioMsixCap {
new( config: &VfioPciConfig, msix_cap_start: u32, vm_socket_irq: Tube, pci_id: u32, device_name: String, ) -> Self235     fn new(
236         config: &VfioPciConfig,
237         msix_cap_start: u32,
238         vm_socket_irq: Tube,
239         pci_id: u32,
240         device_name: String,
241     ) -> Self {
242         let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
243         let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
244         let table_pci_bar = (table & PCI_MSIX_TABLE_BIR) as PciBarIndex;
245         let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
246         let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
247         let pba_pci_bar = (pba & PCI_MSIX_PBA_BIR) as PciBarIndex;
248         let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;
249 
250         let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
251         if table_pci_bar == pba_pci_bar
252             && pba_offset > table_offset
253             && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
254         {
255             table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
256         }
257 
258         let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
259         let pba_size_bytes = ((table_size + BITS_PER_PBA_ENTRY as u64 - 1)
260             / BITS_PER_PBA_ENTRY as u64)
261             * MSIX_PBA_ENTRIES_MODULO;
262         let mut msix_interrupt_evt = Vec::new();
263         for _ in 0..table_size {
264             msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
265         }
266         VfioMsixCap {
267             config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
268             offset: msix_cap_start,
269             table_size: table_size as u16,
270             table_pci_bar,
271             table_offset,
272             table_size_bytes,
273             pba_pci_bar,
274             pba_offset,
275             pba_size_bytes,
276             msix_interrupt_evt,
277         }
278     }
279 
280     // only msix control register is writable and need special handle in pci r/w
is_msix_control_reg(&self, offset: u32, size: u32) -> bool281     fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
282         let control_start = self.offset + PCI_MSIX_FLAGS;
283         let control_end = control_start + 2;
284 
285         offset < control_end && offset + size > control_start
286     }
287 
read_msix_control(&self, data: &mut u32)288     fn read_msix_control(&self, data: &mut u32) {
289         *data = self.config.read_msix_capability(*data);
290     }
291 
write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange>292     fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
293         let old_enabled = self.config.enabled();
294         let old_masked = self.config.masked();
295 
296         self.config
297             .write_msix_capability(PCI_MSIX_FLAGS.into(), data);
298 
299         let new_enabled = self.config.enabled();
300         let new_masked = self.config.masked();
301 
302         if !old_enabled && new_enabled {
303             Some(VfioMsiChange::Enable)
304         } else if old_enabled && !new_enabled {
305             Some(VfioMsiChange::Disable)
306         } else if new_enabled && old_masked != new_masked {
307             Some(VfioMsiChange::FunctionChanged)
308         } else {
309             None
310         }
311     }
312 
is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool313     fn is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool {
314         bar_index == self.table_pci_bar
315             && offset >= self.table_offset
316             && offset < self.table_offset + self.table_size_bytes
317     }
318 
get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange>319     fn get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
320         if bar_index == self.table_pci_bar {
321             AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
322         } else {
323             None
324         }
325     }
326 
read_table(&self, offset: u64, data: &mut [u8])327     fn read_table(&self, offset: u64, data: &mut [u8]) {
328         let offset = offset - self.table_offset;
329         self.config.read_msix_table(offset, data);
330     }
331 
write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus332     fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
333         let offset = offset - self.table_offset;
334         self.config.write_msix_table(offset, data)
335     }
336 
is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool337     fn is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool {
338         bar_index == self.pba_pci_bar
339             && offset >= self.pba_offset
340             && offset < self.pba_offset + self.pba_size_bytes
341     }
342 
get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange>343     fn get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
344         if bar_index == self.pba_pci_bar {
345             AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
346         } else {
347             None
348         }
349     }
350 
read_pba(&self, offset: u64, data: &mut [u8])351     fn read_pba(&self, offset: u64, data: &mut [u8]) {
352         let offset = offset - self.pba_offset;
353         self.config.read_pba_entries(offset, data);
354     }
355 
write_pba(&mut self, offset: u64, data: &[u8])356     fn write_pba(&mut self, offset: u64, data: &[u8]) {
357         let offset = offset - self.pba_offset;
358         self.config.write_pba_entries(offset, data);
359     }
360 
get_msix_irqfd(&self, index: usize) -> Option<&Event>361     fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
362         let irqfd = self.config.get_irqfd(index);
363         if let Some(fd) = irqfd {
364             if self.msix_vector_masked(index) {
365                 Some(&self.msix_interrupt_evt[index])
366             } else {
367                 Some(fd)
368             }
369         } else {
370             None
371         }
372     }
373 
get_msix_irqfds(&self) -> Vec<Option<&Event>>374     fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
375         let mut irqfds = Vec::new();
376 
377         for i in 0..self.table_size {
378             irqfds.push(self.get_msix_irqfd(i as usize));
379         }
380 
381         irqfds
382     }
383 
table_size(&self) -> usize384     fn table_size(&self) -> usize {
385         self.table_size.into()
386     }
387 
clone_msix_evt(&self) -> Vec<Event>388     fn clone_msix_evt(&self) -> Vec<Event> {
389         self.msix_interrupt_evt
390             .iter()
391             .map(|irq| irq.try_clone().unwrap())
392             .collect()
393     }
394 
msix_vector_masked(&self, index: usize) -> bool395     fn msix_vector_masked(&self, index: usize) -> bool {
396         !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
397     }
398 
trigger(&mut self, index: usize)399     fn trigger(&mut self, index: usize) {
400         self.config.trigger(index as u16);
401     }
402 
destroy(&mut self)403     fn destroy(&mut self) {
404         self.config.destroy()
405     }
406 }
407 
408 struct VfioResourceAllocator {
409     // The region that is not allocated yet.
410     regions: BTreeSet<AddressRange>,
411 }
412 
413 impl VfioResourceAllocator {
414     // Creates a new `VfioResourceAllocator` for managing VFIO resources.
415     // Can return `Err` if `base` + `size` overflows a u64.
416     //
417     // * `base` - The starting address of the range to manage.
418     // * `size` - The size of the address range in bytes.
new(pool: AddressRange) -> Result<Self, PciDeviceError>419     fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
420         if pool.is_empty() {
421             return Err(PciDeviceError::SizeZero);
422         }
423         let mut regions = BTreeSet::new();
424         regions.insert(pool);
425         Ok(VfioResourceAllocator { regions })
426     }
427 
internal_allocate_from_slot( &mut self, slot: AddressRange, range: AddressRange, ) -> Result<u64, PciDeviceError>428     fn internal_allocate_from_slot(
429         &mut self,
430         slot: AddressRange,
431         range: AddressRange,
432     ) -> Result<u64, PciDeviceError> {
433         let slot_was_present = self.regions.remove(&slot);
434         assert!(slot_was_present);
435 
436         let (before, after) = slot.non_overlapping_ranges(range);
437 
438         if !before.is_empty() {
439             self.regions.insert(before);
440         }
441         if !after.is_empty() {
442             self.regions.insert(after);
443         }
444 
445         Ok(range.start)
446     }
447 
448     // Allocates a range of addresses from the managed region with a minimal alignment.
449     // Overlapping with a previous allocation is _not_ allowed.
450     // Returns allocated address.
allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError>451     fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
452         if size == 0 {
453             return Err(PciDeviceError::SizeZero);
454         }
455         if !alignment.is_power_of_two() {
456             return Err(PciDeviceError::BadAlignment);
457         }
458 
459         // finds first region matching alignment and size.
460         let region = self.regions.iter().find(|range| {
461             match range.start % alignment {
462                 0 => range.start.checked_add(size - 1),
463                 r => range.start.checked_add(size - 1 + alignment - r),
464             }
465             .map_or(false, |end| end <= range.end)
466         });
467 
468         match region {
469             Some(&slot) => {
470                 let start = match slot.start % alignment {
471                     0 => slot.start,
472                     r => slot.start + alignment - r,
473                 };
474                 let end = start + size - 1;
475                 let range = AddressRange::from_start_and_end(start, end);
476 
477                 self.internal_allocate_from_slot(slot, range)
478             }
479             None => Err(PciDeviceError::OutOfSpace),
480         }
481     }
482 
483     // Allocates a range of addresses from the managed region with a required location.
484     // Overlapping with a previous allocation is allowed.
allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError>485     fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
486         if range.is_empty() {
487             return Err(PciDeviceError::SizeZero);
488         }
489 
490         while let Some(&slot) = self
491             .regions
492             .iter()
493             .find(|avail_range| avail_range.overlaps(range))
494         {
495             let _address = self.internal_allocate_from_slot(slot, range)?;
496         }
497         Ok(())
498     }
499 }
500 
501 struct VfioPciWorker {
502     address: PciAddress,
503     sysfs_path: PathBuf,
504     vm_socket: Tube,
505     name: String,
506     pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
507     msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
508 }
509 
510 impl VfioPciWorker {
run( &mut self, req_irq_evt: Event, wakeup_evt: Event, acpi_notify_evt: Event, kill_evt: Event, msix_evt: Vec<Event>, is_in_low_power: Arc<Mutex<bool>>, gpe: Option<u32>, notification_val: Arc<Mutex<Vec<u32>>>, )511     fn run(
512         &mut self,
513         req_irq_evt: Event,
514         wakeup_evt: Event,
515         acpi_notify_evt: Event,
516         kill_evt: Event,
517         msix_evt: Vec<Event>,
518         is_in_low_power: Arc<Mutex<bool>>,
519         gpe: Option<u32>,
520         notification_val: Arc<Mutex<Vec<u32>>>,
521     ) {
522         #[derive(EventToken, Debug)]
523         enum Token {
524             ReqIrq,
525             WakeUp,
526             AcpiNotifyEvent,
527             Kill,
528             MsixIrqi { index: usize },
529         }
530 
531         let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
532             (&req_irq_evt, Token::ReqIrq),
533             (&wakeup_evt, Token::WakeUp),
534             (&acpi_notify_evt, Token::AcpiNotifyEvent),
535             (&kill_evt, Token::Kill),
536         ]) {
537             Ok(pc) => pc,
538             Err(e) => {
539                 error!(
540                     "{} failed creating vfio WaitContext: {}",
541                     self.name.clone(),
542                     e
543                 );
544                 return;
545             }
546         };
547 
548         for (index, msix_int) in msix_evt.iter().enumerate() {
549             wait_ctx
550                 .add(msix_int, Token::MsixIrqi { index })
551                 .expect("Failed to create vfio WaitContext for msix interrupt event")
552         }
553 
554         'wait: loop {
555             let events = match wait_ctx.wait() {
556                 Ok(v) => v,
557                 Err(e) => {
558                     error!("{} failed polling vfio events: {}", self.name.clone(), e);
559                     break;
560                 }
561             };
562 
563             for event in events.iter().filter(|e| e.is_readable) {
564                 match event.token {
565                     Token::MsixIrqi { index } => {
566                         if let Some(msix_cap) = &self.msix_cap {
567                             msix_cap.lock().trigger(index);
568                         }
569                     }
570                     Token::ReqIrq => {
571                         let device = HotPlugDeviceInfo {
572                             device_type: HotPlugDeviceType::EndPoint,
573                             path: self.sysfs_path.clone(),
574                             hp_interrupt: false,
575                         };
576 
577                         let request = VmRequest::HotPlugVfioCommand { device, add: false };
578                         if self.vm_socket.send(&request).is_ok() {
579                             if let Err(e) = self.vm_socket.recv::<VmResponse>() {
580                                 error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
581                             } else {
582                                 break 'wait;
583                             }
584                         }
585                     }
586                     Token::WakeUp => {
587                         let _ = wakeup_evt.wait();
588 
589                         if *is_in_low_power.lock() {
590                             if let Some(pm_cap) = &self.pm_cap {
591                                 if pm_cap.lock().should_trigger_pme() {
592                                     let request =
593                                         VmRequest::PciPme(self.address.pme_requester_id());
594                                     if self.vm_socket.send(&request).is_ok() {
595                                         if let Err(e) = self.vm_socket.recv::<VmResponse>() {
596                                             error!(
597                                                 "{} failed to send PME: {}",
598                                                 self.name.clone(),
599                                                 e
600                                             );
601                                         }
602                                     }
603                                 }
604                             }
605                         }
606                     }
607                     Token::AcpiNotifyEvent => {
608                         if let Some(gpe) = gpe {
609                             if let Ok(val) = base::EventExt::read_count(&acpi_notify_evt) {
610                                 notification_val.lock().push(val as u32);
611                                 let request = VmRequest::Gpe(gpe);
612                                 if self.vm_socket.send(&request).is_ok() {
613                                     if let Err(e) = self.vm_socket.recv::<VmResponse>() {
614                                         error!("{} failed to send GPE: {}", self.name.clone(), e);
615                                     }
616                                 }
617                             } else {
618                                 error!("{} failed to read acpi_notify_evt", self.name.clone());
619                             }
620                         }
621                     }
622                     Token::Kill => break 'wait,
623                 }
624             }
625         }
626     }
627 }
628 
/// Extracts the "next capability" offset from a PCIe extended capability
/// header: bits 31:20, with the low two bits cleared (offsets are
/// dword-aligned). Returns 0 when this is the last capability in the chain.
fn get_next_from_extcap_header(cap_header: u32) -> u32 {
    (cap_header >> 20) & 0xffc
}
632 
is_skipped_ext_cap(cap_id: u16) -> bool633 fn is_skipped_ext_cap(cap_id: u16) -> bool {
634     matches!(
635         cap_id,
636         // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
637         PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
638     )
639 }
640 
/// Vendor/device-specific bookkeeping for devices needing special handling.
enum DeviceData {
    // Intel graphics: BAR index used to expose the OpRegion.
    IntelGfxData { opregion_index: u32 },
}
644 
/// PCI Express Extended Capabilities information
#[derive(Copy, Clone)]
struct ExtCap {
    /// cap offset in Configuration Space
    offset: u32,
    /// cap size
    size: u32,
    /// next offset, set next non-skipped offset for non-skipped ext cap
    next: u16,
    /// whether to be exposed to guest
    is_skipped: bool,
}
657 
658 /// Implements the Vfio Pci device, then a pci device is added into vm
659 pub struct VfioPciDevice {
660     device: Arc<VfioDevice>,
661     config: VfioPciConfig,
662     hotplug: bool,
663     hotplug_bus_number: Option<u8>,
664     preferred_address: PciAddress,
665     pci_address: Option<PciAddress>,
666     interrupt_evt: Option<IrqLevelEvent>,
667     acpi_notification_evt: Option<Event>,
668     mmio_regions: Vec<PciBarConfiguration>,
669     io_regions: Vec<PciBarConfiguration>,
670     pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
671     msi_cap: Option<VfioMsiCap>,
672     msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
673     irq_type: Option<VfioIrqType>,
674     vm_memory_client: VmMemoryClient,
675     device_data: Option<DeviceData>,
676     pm_evt: Option<Event>,
677     is_in_low_power: Arc<Mutex<bool>>,
678     worker_thread: Option<WorkerThread<VfioPciWorker>>,
679     vm_socket_vm: Option<Tube>,
680     sysfs_path: PathBuf,
681     // PCI Express Extended Capabilities
682     ext_caps: Vec<ExtCap>,
683     vcfg_shm_mmap: Option<MemoryMapping>,
684     mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<VmMemoryRegionId>)>,
685     activated: bool,
686     acpi_notifier_val: Arc<Mutex<Vec<u32>>>,
687     gpe: Option<u32>,
688     base_class_code: PciClassCode,
689 }
690 
691 impl VfioPciDevice {
692     /// Constructs a new Vfio Pci device for the give Vfio device
new( sysfs_path: &Path, device: VfioDevice, hotplug: bool, hotplug_bus_number: Option<u8>, guest_address: Option<PciAddress>, vfio_device_socket_msi: Tube, vfio_device_socket_msix: Tube, vm_memory_client: VmMemoryClient, vfio_device_socket_vm: Tube, ) -> Result<Self, PciDeviceError>693     pub fn new(
694         sysfs_path: &Path,
695         device: VfioDevice,
696         hotplug: bool,
697         hotplug_bus_number: Option<u8>,
698         guest_address: Option<PciAddress>,
699         vfio_device_socket_msi: Tube,
700         vfio_device_socket_msix: Tube,
701         vm_memory_client: VmMemoryClient,
702         vfio_device_socket_vm: Tube,
703     ) -> Result<Self, PciDeviceError> {
704         let preferred_address = if let Some(bus_num) = hotplug_bus_number {
705             debug!("hotplug bus {}", bus_num);
706             PciAddress {
707                 // Caller specify pcie bus number for hotplug device
708                 bus: bus_num,
709                 // devfn should be 0, otherwise pcie root port couldn't detect it
710                 dev: 0,
711                 func: 0,
712             }
713         } else if let Some(guest_address) = guest_address {
714             debug!("guest PCI address {}", guest_address);
715             guest_address
716         } else {
717             let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
718                 PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
719             })?;
720             debug!("parsed device PCI address {}", addr);
721             addr
722         };
723 
724         let dev = Arc::new(device);
725         let config = VfioPciConfig::new(Arc::clone(&dev));
726         let mut msi_socket = Some(vfio_device_socket_msi);
727         let mut msix_socket = Some(vfio_device_socket_msix);
728         let mut msi_cap: Option<VfioMsiCap> = None;
729         let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
730         let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;
731 
732         let mut is_pcie = false;
733         let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
734         let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
735         let device_id: u16 = config.read_config(PCI_DEVICE_ID);
736         let base_class_code = PciClassCode::try_from(config.read_config::<u8>(PCI_BASE_CLASS_CODE))
737             .unwrap_or(PciClassCode::Other);
738 
739         let pci_id = PciId::new(vendor_id, device_id);
740 
741         while cap_next != 0 {
742             let cap_id: u8 = config.read_config(cap_next);
743             if cap_id == PCI_CAP_ID_PM {
744                 pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
745             } else if cap_id == PCI_CAP_ID_MSI {
746                 if let Some(msi_socket) = msi_socket.take() {
747                     msi_cap = Some(VfioMsiCap::new(
748                         &config,
749                         cap_next,
750                         msi_socket,
751                         pci_id.into(),
752                         dev.device_name().to_string(),
753                     ));
754                 }
755             } else if cap_id == PCI_CAP_ID_MSIX {
756                 if let Some(msix_socket) = msix_socket.take() {
757                     msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
758                         &config,
759                         cap_next,
760                         msix_socket,
761                         pci_id.into(),
762                         dev.device_name().to_string(),
763                     ))));
764                 }
765             } else if cap_id == PciCapabilityID::PciExpress as u8 {
766                 is_pcie = true;
767             }
768             let offset = cap_next + PCI_MSI_NEXT_POINTER;
769             cap_next = config.read_config::<u8>(offset).into();
770         }
771 
772         let mut ext_caps: Vec<ExtCap> = Vec::new();
773         if is_pcie {
774             let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
775             while ext_cap_next != 0 {
776                 let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
777                 if ext_cap_config == 0 {
778                     break;
779                 }
780                 ext_caps.push(ExtCap {
781                     offset: ext_cap_next,
782                     // Calculate the size later
783                     size: 0,
784                     // init as the real value
785                     next: get_next_from_extcap_header(ext_cap_config) as u16,
786                     is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
787                 });
788                 ext_cap_next = get_next_from_extcap_header(ext_cap_config);
789             }
790 
791             // Manage extended caps
792             //
793             // Extended capabilities are chained with each pointing to the next, so
794             // we can drop anything other than the head of the chain simply by
795             // modifying the previous next pointer. For the head of the chain, we
796             // can modify the capability ID to something that cannot match a valid
797             // capability. ID PCI_EXT_CAP_ID_CAC is for this since it is no longer
798             // supported.
799             //
800             // reverse order by offset
801             ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
802             let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
803             let mut non_skipped_next: u16 = 0;
804             for ext_cap in ext_caps.iter_mut() {
805                 if !ext_cap.is_skipped {
806                     ext_cap.next = non_skipped_next;
807                     non_skipped_next = ext_cap.offset as u16;
808                 } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
809                     ext_cap.next = non_skipped_next;
810                 }
811                 ext_cap.size = next_offset - ext_cap.offset;
812                 next_offset = ext_cap.offset;
813             }
814             // order by offset
815             ext_caps.reverse();
816         }
817 
818         let is_intel_gfx =
819             base_class_code == PciClassCode::DisplayController && vendor_id == PCI_VENDOR_ID_INTEL;
820         let device_data = if is_intel_gfx {
821             Some(DeviceData::IntelGfxData {
822                 opregion_index: u32::max_value(),
823             })
824         } else {
825             None
826         };
827 
828         Ok(VfioPciDevice {
829             device: dev,
830             config,
831             hotplug,
832             hotplug_bus_number,
833             preferred_address,
834             pci_address: None,
835             interrupt_evt: None,
836             acpi_notification_evt: None,
837             mmio_regions: Vec::new(),
838             io_regions: Vec::new(),
839             pm_cap,
840             msi_cap,
841             msix_cap,
842             irq_type: None,
843             vm_memory_client,
844             device_data,
845             pm_evt: None,
846             is_in_low_power: Arc::new(Mutex::new(false)),
847             worker_thread: None,
848             vm_socket_vm: Some(vfio_device_socket_vm),
849             sysfs_path: sysfs_path.to_path_buf(),
850             ext_caps,
851             vcfg_shm_mmap: None,
852             mapped_mmio_bars: BTreeMap::new(),
853             activated: false,
854             acpi_notifier_val: Arc::new(Mutex::new(Vec::new())),
855             gpe: None,
856             base_class_code,
857         })
858     }
859 
    /// Gets the pci address of the device, if one has already been allocated.
    ///
    /// Returns `None` until an address has been assigned to `self.pci_address`.
    pub fn pci_address(&self) -> Option<PciAddress> {
        self.pci_address
    }
864 
is_gfx(&self) -> bool865     pub fn is_gfx(&self) -> bool {
866         self.base_class_code == PciClassCode::DisplayController
867     }
868 
is_intel_gfx(&self) -> bool869     fn is_intel_gfx(&self) -> bool {
870         matches!(self.device_data, Some(DeviceData::IntelGfxData { .. }))
871     }
872 
enable_acpi_notification(&mut self) -> Result<(), PciDeviceError>873     fn enable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
874         if let Some(ref acpi_notification_evt) = self.acpi_notification_evt {
875             return self
876                 .device
877                 .acpi_notification_evt_enable(acpi_notification_evt, VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
878                 .map_err(|_| PciDeviceError::AcpiNotifySetupFailed);
879         }
880         Err(PciDeviceError::AcpiNotifySetupFailed)
881     }
882 
883     #[allow(dead_code)]
disable_acpi_notification(&mut self) -> Result<(), PciDeviceError>884     fn disable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
885         if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
886             return self
887                 .device
888                 .acpi_notification_disable(VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
889                 .map_err(|_| PciDeviceError::AcpiNotifyDeactivationFailed);
890         }
891         Err(PciDeviceError::AcpiNotifyDeactivationFailed)
892     }
893 
894     #[allow(dead_code)]
test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError>895     fn test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError> {
896         if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
897             return self
898                 .device
899                 .acpi_notification_test(VFIO_PCI_ACPI_NTFY_IRQ_INDEX, val)
900                 .map_err(|_| PciDeviceError::AcpiNotifyTestFailed);
901         }
902         Err(PciDeviceError::AcpiNotifyTestFailed)
903     }
904 
    /// Enables legacy INTx interrupt delivery for the device.
    ///
    /// The sequence is: register the trigger event with VFIO, mask the line,
    /// register the resample event, then unmask. On any intermediate failure
    /// the partially-configured IRQ is torn down again via `disable_intx`.
    /// Does nothing if no `interrupt_evt` has been assigned yet.
    fn enable_intx(&mut self) {
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            if let Err(e) = self.device.irq_enable(
                &[Some(interrupt_evt.get_trigger())],
                VFIO_PCI_INTX_IRQ_INDEX,
                0,
            ) {
                error!("{} Intx enable failed: {}", self.debug_label(), e);
                return;
            }
            // Keep the line masked until the resample event is registered so
            // no interrupt fires while setup is incomplete.
            if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx mask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self
                .device
                .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
            {
                error!("{} resample enable failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx unmask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            self.irq_type = Some(VfioIrqType::Intx);
        }
    }
936 
disable_intx(&mut self)937     fn disable_intx(&mut self) {
938         if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
939             error!("{} Intx disable failed: {}", self.debug_label(), e);
940         }
941         self.irq_type = None;
942     }
943 
disable_irqs(&mut self)944     fn disable_irqs(&mut self) {
945         match self.irq_type {
946             Some(VfioIrqType::Msi) => self.disable_msi(),
947             Some(VfioIrqType::Msix) => self.disable_msix(),
948             _ => (),
949         }
950 
951         // Above disable_msi() or disable_msix() will enable intx again.
952         // so disable_intx here again.
953         if let Some(VfioIrqType::Intx) = self.irq_type {
954             self.disable_intx();
955         }
956     }
957 
enable_msi(&mut self)958     fn enable_msi(&mut self) {
959         self.disable_irqs();
960 
961         let irqfd = match &self.msi_cap {
962             Some(cap) => {
963                 if let Some(fd) = cap.get_msi_irqfd() {
964                     fd
965                 } else {
966                     self.enable_intx();
967                     return;
968                 }
969             }
970             None => {
971                 self.enable_intx();
972                 return;
973             }
974         };
975 
976         if let Err(e) = self
977             .device
978             .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
979         {
980             error!("{} failed to enable msi: {}", self.debug_label(), e);
981             self.enable_intx();
982             return;
983         }
984 
985         self.irq_type = Some(VfioIrqType::Msi);
986     }
987 
disable_msi(&mut self)988     fn disable_msi(&mut self) {
989         if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
990             error!("{} failed to disable msi: {}", self.debug_label(), e);
991             return;
992         }
993         self.irq_type = None;
994 
995         self.enable_intx();
996     }
997 
enable_msix(&mut self)998     fn enable_msix(&mut self) {
999         if self.msix_cap.is_none() {
1000             return;
1001         }
1002 
1003         self.disable_irqs();
1004         let cap = self.msix_cap.as_ref().unwrap().lock();
1005         let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());
1006 
1007         let mut failed = false;
1008         if !vector_in_use {
1009             // If there are no msix vectors currently in use, we explicitly assign a new eventfd
1010             // to vector 0. Then we enable it and immediately disable it, so that vfio will
1011             // activate physical device. If there are available msix vectors, just enable them
1012             // instead.
1013             let fd = Event::new().expect("failed to create event");
1014             let table_size = cap.table_size();
1015             let mut irqfds = vec![None; table_size];
1016             irqfds[0] = Some(&fd);
1017             for fd in irqfds.iter_mut().skip(1) {
1018                 *fd = None;
1019             }
1020             if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1021                 error!("{} failed to enable msix: {}", self.debug_label(), e);
1022                 failed = true;
1023             }
1024             irqfds[0] = None;
1025             if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1026                 error!("{} failed to enable msix: {}", self.debug_label(), e);
1027                 failed = true;
1028             }
1029         } else {
1030             let result = self
1031                 .device
1032                 .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
1033             if let Err(e) = result {
1034                 error!("{} failed to enable msix: {}", self.debug_label(), e);
1035                 failed = true;
1036             }
1037         }
1038 
1039         std::mem::drop(cap);
1040         if failed {
1041             self.enable_intx();
1042             return;
1043         }
1044         self.irq_type = Some(VfioIrqType::Msix);
1045     }
1046 
disable_msix(&mut self)1047     fn disable_msix(&mut self) {
1048         if self.msix_cap.is_none() {
1049             return;
1050         }
1051         if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
1052             error!("{} failed to disable msix: {}", self.debug_label(), e);
1053             return;
1054         }
1055         self.irq_type = None;
1056         self.enable_intx();
1057     }
1058 
msix_vectors_update(&self) -> Result<(), VfioError>1059     fn msix_vectors_update(&self) -> Result<(), VfioError> {
1060         if let Some(cap) = &self.msix_cap {
1061             self.device
1062                 .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
1063         }
1064         Ok(())
1065     }
1066 
msix_vector_update(&self, index: usize, irqfd: Option<&Event>)1067     fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
1068         if let Err(e) = self
1069             .device
1070             .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
1071         {
1072             error!(
1073                 "{} failed to update msix vector {}: {}",
1074                 self.debug_label(),
1075                 index,
1076                 e
1077             );
1078         }
1079     }
1080 
    /// Returns `bar_mmaps` with the page-aligned spans covering `remove_mmaps`
    /// carved out, splitting sparse-mmap areas as needed.
    ///
    /// Accesses inside the removed spans then stay trapped instead of being
    /// direct-mapped. Returns an empty vector if an allocator cannot be built
    /// for any mmap area, leaving the whole BAR trapped.
    fn adjust_bar_mmap(
        &self,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
        remove_mmaps: &[AddressRange],
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
        let pgmask = (pagesize() as u64) - 1;

        for mmap in bar_mmaps.iter() {
            // Skip degenerate (empty or overflowing) areas.
            let mmap_range = if let Some(mmap_range) =
                AddressRange::from_start_and_size(mmap.offset, mmap.size)
            {
                mmap_range
            } else {
                continue;
            };
            // Start with the whole area "free"; carving out the remove ranges
            // below leaves the still-mmappable pieces as free regions.
            let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
                Ok(a) => a,
                Err(e) => {
                    error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    mmaps.clear();
                    return mmaps;
                }
            };

            for &(mut remove_range) in remove_mmaps.iter() {
                remove_range = remove_range.intersect(mmap_range);
                if !remove_range.is_empty() {
                    // align offsets to page size — mmap granularity is a page,
                    // so the carved-out span is widened to page boundaries.
                    let begin = remove_range.start & !pgmask;
                    let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
                    let remove_range = AddressRange::from_start_and_end(begin, end);
                    if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
                        error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    }
                }
            }

            // Whatever remains free in the allocator is safe to mmap.
            for mmap in to_mmap.regions {
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: mmap.start,
                    size: mmap.end - mmap.start + 1,
                });
            }
        }

        mmaps
    }
1129 
remove_bar_mmap_msix( &self, bar_index: PciBarIndex, bar_mmaps: Vec<vfio_region_sparse_mmap_area>, ) -> Vec<vfio_region_sparse_mmap_area>1130     fn remove_bar_mmap_msix(
1131         &self,
1132         bar_index: PciBarIndex,
1133         bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1134     ) -> Vec<vfio_region_sparse_mmap_area> {
1135         let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
1136         let mut msix_regions = Vec::new();
1137 
1138         if let Some(t) = msix_cap.get_msix_table(bar_index) {
1139             msix_regions.push(t);
1140         }
1141         if let Some(p) = msix_cap.get_msix_pba(bar_index) {
1142             msix_regions.push(p);
1143         }
1144 
1145         if msix_regions.is_empty() {
1146             return bar_mmaps;
1147         }
1148 
1149         self.adjust_bar_mmap(bar_mmaps, &msix_regions)
1150     }
1151 
    /// Registers the mmappable portions of BAR `index` into guest physical
    /// memory at `bar_addr`, returning the ids of the registered regions.
    ///
    /// Spans backing the MSI-X table/PBA are excluded so those accesses stay
    /// trapped. Registration stops at the first failure; ids registered so
    /// far are still returned so callers can unmap them later.
    fn add_bar_mmap(&self, index: PciBarIndex, bar_addr: u64) -> Vec<VmMemoryRegionId> {
        let mut mmaps_ids: Vec<VmMemoryRegionId> = Vec::new();
        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
            // the bar storing msix table and pba couldn't mmap.
            // these bars should be trapped, so that msix could be emulated.
            let mut mmaps = self.device.get_region_mmap(index);

            if self.msix_cap.is_some() {
                mmaps = self.remove_bar_mmap_msix(index, mmaps);
            }
            if mmaps.is_empty() {
                return mmaps_ids;
            }

            for mmap in mmaps.iter() {
                let mmap_offset = mmap.offset;
                let mmap_size = mmap.size;
                let guest_map_start = bar_addr + mmap_offset;
                let region_offset = self.device.get_region_offset(index);
                let offset = region_offset + mmap_offset;
                // Each registration gets its own handle to the device file.
                let descriptor = match self.device.device_file().try_clone() {
                    Ok(device_file) => device_file.into(),
                    Err(_) => break,
                };
                match self.vm_memory_client.register_memory(
                    VmMemorySource::Descriptor {
                        descriptor,
                        offset,
                        size: mmap_size,
                    },
                    VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
                    Protection::read_write(),
                    MemCacheType::CacheCoherent,
                ) {
                    Ok(id) => {
                        mmaps_ids.push(id);
                    }
                    Err(e) => {
                        error!("register_memory failed: {}", e);
                        break;
                    }
                }
            }
        }

        mmaps_ids
    }
1199 
remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId])1200     fn remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId]) {
1201         for mmap_id in mmap_ids {
1202             if let Err(e) = self.vm_memory_client.unregister_memory(*mmap_id) {
1203                 error!("unregister_memory failed: {}", e);
1204             }
1205         }
1206     }
1207 
disable_bars_mmap(&mut self)1208     fn disable_bars_mmap(&mut self) {
1209         for (_, (_, mmap_ids)) in self.mapped_mmio_bars.iter() {
1210             self.remove_bar_mmap(mmap_ids);
1211         }
1212         self.mapped_mmio_bars.clear();
1213     }
1214 
    /// Reconciles hypervisor mappings with the currently programmed BAR
    /// addresses.
    ///
    /// BARs whose address is unchanged keep their existing mappings. Moved
    /// BARs are unmapped first, and all remaps happen only after every
    /// removal, so a new mapping can never overlap a stale one.
    fn commit_bars_mmap(&mut self) {
        // Unmap all bars before remapping bars, to prevent issues with overlap
        let mut needs_map = Vec::new();
        for mmio_info in self.mmio_regions.iter() {
            let bar_idx = mmio_info.bar_index();
            let addr = mmio_info.address();

            if let Some((cur_addr, ids)) = self.mapped_mmio_bars.remove(&bar_idx) {
                if cur_addr == addr {
                    // Address unchanged: reinstate the existing mapping entry.
                    self.mapped_mmio_bars.insert(bar_idx, (cur_addr, ids));
                    continue;
                } else {
                    self.remove_bar_mmap(&ids);
                }
            }

            // Address 0 means the BAR is not programmed; don't map it.
            if addr != 0 {
                needs_map.push((bar_idx, addr));
            }
        }

        for (bar_idx, addr) in needs_map.iter() {
            let ids = self.add_bar_mmap(*bar_idx, *addr);
            self.mapped_mmio_bars.insert(*bar_idx, (*addr, ids));
        }
    }
1241 
close(&mut self)1242     fn close(&mut self) {
1243         if let Some(msi) = self.msi_cap.as_mut() {
1244             msi.destroy();
1245         }
1246         if let Some(msix) = &self.msix_cap {
1247             msix.lock().destroy();
1248         }
1249         self.disable_bars_mmap();
1250         self.device.close();
1251     }
1252 
    /// Spawns the `VfioPciWorker` thread that services device-originated
    /// events for this device: VFIO device requests, PM events, ACPI
    /// notifications, and MSI-X events.
    ///
    /// Consumes `vm_socket_vm`; returns early (without a worker) if it was
    /// already taken or if any event setup fails. Failures are logged.
    fn start_work_thread(&mut self) {
        // The VM socket can only be handed to one worker.
        let vm_socket = match self.vm_socket_vm.take() {
            Some(socket) => socket,
            None => return,
        };

        // Event signaled through VFIO's request IRQ index.
        let req_evt = match Event::new() {
            Ok(evt) => {
                if let Err(e) = self
                    .device
                    .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
                {
                    error!("{} enable req_irq failed: {}", self.debug_label(), e);
                    return;
                }
                evt
            }
            Err(_) => return,
        };

        // Cloned event pair: one end stays on the device, the other is moved
        // into the worker.
        let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{} failed creating PM Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };
        self.pm_evt = Some(self_pm_evt);

        // Same pairing scheme for the ACPI notification event.
        let (self_acpi_notify_evt, acpi_notify_evt) =
            match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
                Ok(v) => v,
                Err(e) => {
                    error!(
                        "{} failed creating ACPI Event pair: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
        self.acpi_notification_evt = Some(self_acpi_notify_evt);

        // Best effort: a failure here is logged but does not stop the worker.
        if let Err(e) = self.enable_acpi_notification() {
            error!("{}: {}", self.debug_label(), e);
        }

        let mut msix_evt = Vec::new();
        if let Some(msix_cap) = &self.msix_cap {
            msix_evt = msix_cap.lock().clone_msix_evt();
        }

        // Clone everything the worker needs before moving it into the thread.
        let name = self.device.device_name().to_string();
        let address = self.pci_address.expect("Unassigned PCI Address.");
        let sysfs_path = self.sysfs_path.clone();
        let pm_cap = self.pm_cap.clone();
        let msix_cap = self.msix_cap.clone();
        let is_in_low_power = self.is_in_low_power.clone();
        let gpe_nr = self.gpe;
        let notification_val = self.acpi_notifier_val.clone();
        self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
            let mut worker = VfioPciWorker {
                address,
                sysfs_path,
                vm_socket,
                name,
                pm_cap,
                msix_cap,
            };
            worker.run(
                req_evt,
                pm_evt,
                acpi_notify_evt,
                kill_evt,
                msix_evt,
                is_in_low_power,
                gpe_nr,
                notification_val,
            );
            worker
        }));
        self.activated = true;
    }
1340 
    /// Probes the device's BARs (and expansion ROM) using the standard
    /// all-ones write/read-back sizing protocol and returns the memory BARs.
    ///
    /// I/O BARs are recorded in `self.io_regions` as a side effect. A 64-bit
    /// memory BAR consumes two consecutive BAR slots.
    fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
        let mut i = VFIO_PCI_BAR0_REGION_INDEX;
        let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();

        while i <= VFIO_PCI_ROM_REGION_INDEX {
            let mut low: u32 = 0xffffffff;
            // The ROM BAR lives at config offset 0x30; regular BARs start at
            // 0x10, 4 bytes apart.
            let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
                0x30
            } else {
                0x10 + i * 4
            };
            // Write all ones; the device reads back its size/flag mask.
            self.config.write_config(low, offset);
            low = self.config.read_config(offset);

            let low_flag = low & 0xf;
            let is_64bit = low_flag & 0x4 == 0x4;
            // Bit 0 clear => memory BAR (the ROM BAR is always memory).
            if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
                let mut upper: u32 = 0xffffffff;
                if is_64bit {
                    self.config.write_config(upper, offset + 4);
                    upper = self.config.read_config(offset + 4);
                }

                low &= 0xffff_fff0;
                // BAR size is the two's complement of the combined 64-bit
                // address mask.
                let mut size: u64 = u64::from(upper);
                size <<= 32;
                size |= u64::from(low);
                size = !size + 1;
                let region_type = if is_64bit {
                    PciBarRegionType::Memory64BitRegion
                } else {
                    PciBarRegionType::Memory32BitRegion
                };
                // Bit 3 marks a prefetchable memory BAR.
                let prefetch = if low_flag & 0x8 == 0x8 {
                    PciBarPrefetchable::Prefetchable
                } else {
                    PciBarPrefetchable::NotPrefetchable
                };
                mem_bars.push(PciBarConfiguration::new(
                    i as usize,
                    size,
                    region_type,
                    prefetch,
                ));
            } else if low_flag & 0x1 == 0x1 {
                // Bit 0 set => I/O BAR; the size mask uses bits [31:2].
                let size = !(low & 0xffff_fffc) + 1;
                self.io_regions.push(PciBarConfiguration::new(
                    i as usize,
                    size.into(),
                    PciBarRegionType::IoRegion,
                    PciBarPrefetchable::NotPrefetchable,
                ));
            }

            // A 64-bit BAR occupies the next slot as well.
            if is_64bit {
                i += 2;
            } else {
                i += 1;
            }
        }
        mem_bars
    }
1403 
configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64)1404     fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
1405         let offset: u32 = bar_info.reg_index() as u32 * 4;
1406         let mmio_region = *bar_info;
1407         self.mmio_regions.push(mmio_region.set_address(bar_addr));
1408 
1409         let val: u32 = self.config.read_config(offset);
1410         let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
1411         self.config.write_config(low, offset);
1412         if bar_info.is_64bit_memory() {
1413             let upper = (bar_addr >> 32) as u32;
1414             self.config.write_config(upper, offset + 4);
1415         }
1416     }
1417 
    /// Allocates guest MMIO for each memory BAR of a root-bus device and
    /// programs the BAR registers via `configure_barmem`.
    ///
    /// Hotplugged devices skip allocation and get their BARs programmed to 0:
    /// the guest OS assigns addresses from the parent bridge window instead.
    ///
    /// # Errors
    /// `IoAllocationFailed` if MMIO cannot be allocated for a BAR.
    fn allocate_root_barmem(
        &mut self,
        mem_bars: &[PciBarConfiguration],
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let address = self.pci_address.unwrap();
        let mut ranges: Vec<BarRange> = Vec::new();
        for mem_bar in mem_bars {
            let bar_size = mem_bar.size();
            let mut bar_addr: u64 = 0;
            // Don't allocate mmio for hotplug device, OS will allocate it from
            // its parent's bridge window.
            if !self.hotplug {
                bar_addr = resources
                    .allocate_mmio(
                        bar_size,
                        Alloc::PciBar {
                            bus: address.bus,
                            dev: address.dev,
                            func: address.func,
                            bar: mem_bar.bar_index() as u8,
                        },
                        "vfio_bar".to_string(),
                        AllocOptions::new()
                            .prefetchable(mem_bar.is_prefetchable())
                            // 32-bit BARs must stay below 4 GiB.
                            .max_address(if mem_bar.is_64bit_memory() {
                                u64::MAX
                            } else {
                                u32::MAX.into()
                            })
                            // BARs are naturally aligned to their size.
                            .align(bar_size),
                    )
                    .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
                ranges.push(BarRange {
                    addr: bar_addr,
                    size: bar_size,
                    prefetchable: mem_bar.is_prefetchable(),
                });
            }
            self.configure_barmem(mem_bar, bar_addr);
        }
        Ok(ranges)
    }
1461 
allocate_nonroot_barmem( &mut self, mem_bars: &mut [PciBarConfiguration], resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1462     fn allocate_nonroot_barmem(
1463         &mut self,
1464         mem_bars: &mut [PciBarConfiguration],
1465         resources: &mut SystemAllocator,
1466     ) -> Result<Vec<BarRange>, PciDeviceError> {
1467         const NON_PREFETCHABLE: usize = 0;
1468         const PREFETCHABLE: usize = 1;
1469         const ARRAY_SIZE: usize = 2;
1470         let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1471         let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1472             match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1473                 Ok(a) => a,
1474                 Err(e) => {
1475                     error!(
1476                         "{} init nonroot VfioResourceAllocator failed: {}",
1477                         self.debug_label(),
1478                         e
1479                     );
1480                     return Err(e);
1481                 }
1482             },
1483             match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1484                 Ok(a) => a,
1485                 Err(e) => {
1486                     error!(
1487                         "{} init nonroot VfioResourceAllocator failed: {}",
1488                         self.debug_label(),
1489                         e
1490                     );
1491                     return Err(e);
1492                 }
1493             },
1494         ];
1495         let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1496         // the window must be 1M-aligned as per the PCI spec
1497         let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
1498         let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];
1499 
1500         // Descend by bar size, this could reduce allocated size for all the bars.
1501         mem_bars.sort_by_key(|a| Reverse(a.size()));
1502         for mem_bar in mem_bars {
1503             let prefetchable = mem_bar.is_prefetchable();
1504             let is_64bit = mem_bar.is_64bit_memory();
1505 
1506             // if one prefetchable bar is 32bit, all the prefetchable bars should be in Low MMIO,
1507             // as all the prefetchable bars should be in one region
1508             if prefetchable && !is_64bit {
1509                 memtype[PREFETCHABLE] = MmioType::Low;
1510             }
1511             let i = if prefetchable {
1512                 PREFETCHABLE
1513             } else {
1514                 NON_PREFETCHABLE
1515             };
1516             let bar_size = mem_bar.size();
1517             let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1518                 Ok(s) => s,
1519                 Err(e) => {
1520                     error!(
1521                         "{} nonroot allocate_wit_align failed: {}",
1522                         self.debug_label(),
1523                         e
1524                     );
1525                     return Err(e);
1526                 }
1527             };
1528             window_sz[i] = max(window_sz[i], start + bar_size);
1529             alignment[i] = max(alignment[i], bar_size);
1530             let mem_info = (*mem_bar).set_address(start);
1531             membars[i].push(mem_info);
1532         }
1533 
1534         let address = self.pci_address.unwrap();
1535         let mut ranges: Vec<BarRange> = Vec::new();
1536         for (index, bars) in membars.iter().enumerate() {
1537             if bars.is_empty() {
1538                 continue;
1539             }
1540 
1541             let i = if index == 1 {
1542                 PREFETCHABLE
1543             } else {
1544                 NON_PREFETCHABLE
1545             };
1546             let mut window_addr: u64 = 0;
1547             // Don't allocate mmio for hotplug device, OS will allocate it from
1548             // its parent's bridge window.
1549             if !self.hotplug {
1550                 window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
1551                 let alloc = if i == NON_PREFETCHABLE {
1552                     Alloc::PciBridgeWindow {
1553                         bus: address.bus,
1554                         dev: address.dev,
1555                         func: address.func,
1556                     }
1557                 } else {
1558                     Alloc::PciBridgePrefetchWindow {
1559                         bus: address.bus,
1560                         dev: address.dev,
1561                         func: address.func,
1562                     }
1563                 };
1564                 window_addr = resources
1565                     .mmio_allocator(memtype[i])
1566                     .allocate_with_align(
1567                         window_sz[i],
1568                         alloc,
1569                         "vfio_bar_window".to_string(),
1570                         alignment[i],
1571                     )
1572                     .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
1573                 for mem_info in bars {
1574                     let bar_addr = window_addr + mem_info.address();
1575                     ranges.push(BarRange {
1576                         addr: bar_addr,
1577                         size: mem_info.size(),
1578                         prefetchable: mem_info.is_prefetchable(),
1579                     });
1580                 }
1581             }
1582 
1583             for mem_info in bars {
1584                 let bar_addr = window_addr + mem_info.address();
1585                 self.configure_barmem(mem_info, bar_addr);
1586             }
1587         }
1588         Ok(ranges)
1589     }
1590 
    /// Return the supported iova max address of the Vfio Pci device,
    /// as reported by the underlying VFIO device (`get_max_addr`).
    pub fn get_max_iova(&self) -> u64 {
        self.device.get_max_addr()
    }
1595 
get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap>1596     fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
1597         self.ext_caps
1598             .iter()
1599             .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
1600             .cloned()
1601     }
1602 
is_skipped_reg(&self, reg: u32) -> bool1603     fn is_skipped_reg(&self, reg: u32) -> bool {
1604         // fast handle for pci config space
1605         if reg < PCI_CONFIG_SPACE_SIZE {
1606             return false;
1607         }
1608 
1609         self.get_ext_cap_by_reg(reg)
1610             .map_or(false, |cap| cap.is_skipped)
1611     }
1612 }
1613 
1614 impl PciDevice for VfioPciDevice {
debug_label(&self) -> String1615     fn debug_label(&self) -> String {
1616         format!("vfio {} device", self.device.device_name())
1617     }
1618 
    /// Reports the guest PCI address this device would prefer to occupy
    /// (set at construction time; presumably mirrors the host address —
    /// confirm against VfioPciDevice::new).
    fn preferred_address(&self) -> Option<PciAddress> {
        Some(self.preferred_address)
    }
1622 
allocate_address( &mut self, resources: &mut SystemAllocator, ) -> Result<PciAddress, PciDeviceError>1623     fn allocate_address(
1624         &mut self,
1625         resources: &mut SystemAllocator,
1626     ) -> Result<PciAddress, PciDeviceError> {
1627         if self.pci_address.is_none() {
1628             let mut address = self.preferred_address;
1629             while address.func < 8 {
1630                 if resources.reserve_pci(
1631                     Alloc::PciBar {
1632                         bus: address.bus,
1633                         dev: address.dev,
1634                         func: address.func,
1635                         bar: 0,
1636                     },
1637                     self.debug_label(),
1638                 ) {
1639                     self.pci_address = Some(address);
1640                     break;
1641                 } else if self.hotplug_bus_number.is_none() {
1642                     break;
1643                 } else {
1644                     address.func += 1;
1645                 }
1646             }
1647         }
1648         self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1649     }
1650 
keep_rds(&self) -> Vec<RawDescriptor>1651     fn keep_rds(&self) -> Vec<RawDescriptor> {
1652         let mut rds = self.device.keep_rds();
1653         if let Some(ref interrupt_evt) = self.interrupt_evt {
1654             rds.extend(interrupt_evt.as_raw_descriptors());
1655         }
1656         rds.push(self.vm_memory_client.as_raw_descriptor());
1657         if let Some(vm_socket_vm) = &self.vm_socket_vm {
1658             rds.push(vm_socket_vm.as_raw_descriptor());
1659         }
1660         if let Some(msi_cap) = &self.msi_cap {
1661             rds.push(msi_cap.config.get_msi_socket());
1662         }
1663         if let Some(msix_cap) = &self.msix_cap {
1664             rds.push(msix_cap.lock().config.as_raw_descriptor());
1665         }
1666         rds
1667     }
1668 
preferred_irq(&self) -> PreferredIrq1669     fn preferred_irq(&self) -> PreferredIrq {
1670         // Is INTx configured?
1671         let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
1672             1 => PciInterruptPin::IntA,
1673             2 => PciInterruptPin::IntB,
1674             3 => PciInterruptPin::IntC,
1675             4 => PciInterruptPin::IntD,
1676             _ => return PreferredIrq::None,
1677         };
1678 
1679         // TODO: replace sysfs/irq value parsing with vfio interface
1680         //       reporting host allocated interrupt number and type.
1681         let path = self.sysfs_path.join("irq");
1682         let gsi = fs::read_to_string(path)
1683             .map(|v| v.trim().parse::<u32>().unwrap_or(0))
1684             .unwrap_or(0);
1685 
1686         PreferredIrq::Fixed { pin, gsi }
1687     }
1688 
    /// Wires up the INTx level-triggered interrupt: stores the event pair,
    /// enables INTx on the VFIO device, and exposes the assigned pin/line
    /// numbers to the guest through config space.
    fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
        // Keep event/resample event references.
        self.interrupt_evt = Some(irq_evt);

        // enable INTX
        self.enable_intx();

        // Publish the pin and interrupt line so guest drivers can discover
        // them via the standard config registers.
        self.config
            .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
        self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
    }
1700 
allocate_io_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1701     fn allocate_io_bars(
1702         &mut self,
1703         resources: &mut SystemAllocator,
1704     ) -> Result<Vec<BarRange>, PciDeviceError> {
1705         let address = self
1706             .pci_address
1707             .expect("allocate_address must be called prior to allocate_device_bars");
1708 
1709         let mut mem_bars = self.collect_bars();
1710 
1711         let ranges = if address.bus == 0 {
1712             self.allocate_root_barmem(&mem_bars, resources)?
1713         } else {
1714             self.allocate_nonroot_barmem(&mut mem_bars, resources)?
1715         };
1716 
1717         // Quirk, enable igd memory for guest vga arbitrate, otherwise kernel vga arbitrate
1718         // driver doesn't claim this vga device, then xorg couldn't boot up.
1719         if self.is_intel_gfx() {
1720             let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
1721             cmd |= PCI_COMMAND_MEMORY;
1722             self.config.write_config(cmd, PCI_COMMAND);
1723         }
1724         Ok(ranges)
1725     }
1726 
    /// Allocates non-BAR device memory. For Intel graphics only: carves out a
    /// guest MMIO range for the IGD OpRegion, records it as an extra MMIO
    /// region, and publishes its guest address at config offset 0xFC.
    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let mut ranges: Vec<BarRange> = Vec::new();

        if !self.is_intel_gfx() {
            return Ok(ranges);
        }

        // Make intel gfx's opregion as mmio bar, and allocate a gpa for it
        // then write this gpa into pci cfg register
        if let Some((index, size)) = self.device.get_cap_type_info(
            VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
            VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
        ) {
            let address = self
                .pci_address
                .expect("allocate_address must be called prior to allocate_device_bars");
            // Constrain the opregion below 4GiB: its address is written into
            // a 32-bit config register (0xFC) below.
            let bar_addr = resources
                .allocate_mmio(
                    size,
                    Alloc::PciBar {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                        bar: (index * 4) as u8,
                    },
                    "vfio_bar".to_string(),
                    AllocOptions::new().max_address(u32::MAX.into()),
                )
                .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
            ranges.push(BarRange {
                addr: bar_addr,
                size,
                prefetchable: false,
            });
            // Remember the opregion's region index so write_bar can drop
            // guest writes to it.
            self.device_data = Some(DeviceData::IntelGfxData {
                opregion_index: index,
            });

            self.mmio_regions.push(
                PciBarConfiguration::new(
                    index as usize,
                    size,
                    PciBarRegionType::Memory32BitRegion,
                    PciBarPrefetchable::NotPrefetchable,
                )
                .set_address(bar_addr),
            );
            // Publish the opregion's guest address at config offset 0xFC
            // (presumably the IGD ASLS register — confirm against IGD docs).
            self.config.write_config(bar_addr as u32, 0xFC);
        }

        Ok(ranges)
    }
1782 
get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration>1783     fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1784         for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
1785             if region.bar_index() == bar_num {
1786                 let command: u8 = self.config.read_config(PCI_COMMAND);
1787                 if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
1788                     return None;
1789                 } else {
1790                     return Some(*region);
1791                 }
1792             }
1793         }
1794 
1795         None
1796     }
1797 
    /// No extra capabilities to register: the passthrough device's own config
    /// space already exposes its capability list.
    fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
        Ok(())
    }
1801 
    /// Reads a 32-bit config register, layering virtualizations over the
    /// physical config space: rewritten extended-capability next pointers,
    /// hidden ("skipped") extended caps, I/O BARs masked to 0, locally
    /// emulated MSI-X/PM control registers, and an Intel graphics quirk.
    fn read_config_register(&self, reg_idx: usize) -> u32 {
        let reg: u32 = (reg_idx * 4) as u32;
        let mut config: u32 = self.config.read_config(reg);

        // See VfioPciDevice::new for details how extended caps are managed
        if reg >= PCI_CONFIG_SPACE_SIZE {
            let ext_cap = self.get_ext_cap_by_reg(reg);
            if let Some(ext_cap) = ext_cap {
                // First dword of an ext cap header: the top bits (masked with
                // 0xffc << 20 here) carry the next-capability pointer —
                // substitute the virtualized chain pointer.
                if ext_cap.offset == reg {
                    config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
                }

                if ext_cap.is_skipped {
                    // The cap at the very start of extended config space can't
                    // read as all-zero or the chain would be unwalkable;
                    // present a placeholder capability ID instead.
                    if reg == PCI_CONFIG_SPACE_SIZE {
                        config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
                    } else {
                        config = 0;
                    }
                }
            }
        }

        // Ignore IO bar
        if (0x10..=0x24).contains(&reg) {
            let bar_idx = (reg as usize - 0x10) / 4;
            if let Some(bar) = self.get_bar_configuration(bar_idx) {
                if bar.is_io() {
                    config = 0;
                }
            }
        } else if let Some(msix_cap) = &self.msix_cap {
            // MSI-X control is emulated locally, not read from the device.
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(reg, 4) {
                msix_cap.read_msix_control(&mut config);
            }
        } else if let Some(pm_cap) = &self.pm_cap {
            // Likewise the power-management registers.
            let pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(reg) {
                config = pm_cap.read(reg);
            }
        }

        // Quirk for intel graphic, set stolen memory size to 0 in pci_cfg[0x51]
        if self.is_intel_gfx() && reg == 0x50 {
            config &= 0xffff00ff;
        }

        config
    }
1851 
    /// Handles a guest config-space write: routes PM/MSI/MSI-X control writes
    /// to the local emulations (enabling/disabling interrupts as needed),
    /// forwards everything except skipped extended caps to the device, and
    /// shadows BAR reprogramming so mappings can be (re)committed.
    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        // When guest write config register at the first time, start worker thread
        if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
            self.start_work_thread();
        };

        // Byte offset into config space of this write.
        let start = (reg_idx * 4) as u64 + offset;

        if let Some(pm_cap) = self.pm_cap.as_mut() {
            let mut pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(start as u32) {
                pm_cap.write(start, data);
            }
        }

        let mut msi_change: Option<VfioMsiChange> = None;
        if let Some(msi_cap) = self.msi_cap.as_mut() {
            if msi_cap.is_msi_reg(start, data.len()) {
                msi_change = msi_cap.write_msi_reg(start, data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msi(),
            Some(VfioMsiChange::Disable) => self.disable_msi(),
            _ => (),
        }

        // Reuse the change slot for MSI-X control writes.
        msi_change = None;
        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
                msi_change = msix_cap.write_msix_control(data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msix(),
            Some(VfioMsiChange::Disable) => self.disable_msix(),
            Some(VfioMsiChange::FunctionChanged) => {
                if let Err(e) = self.msix_vectors_update() {
                    error!("update msix vectors failed: {}", e);
                }
            }
            _ => (),
        }

        // Writes targeting hidden extended caps are not forwarded to the
        // physical device.
        if !self.is_skipped_reg(start as u32) {
            self.device
                .region_write(VFIO_PCI_CONFIG_REGION_INDEX as usize, data, start);
        }

        // if guest enable memory access, then enable bar mappable once
        if start == PCI_COMMAND as u64
            && data.len() == 2
            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
        {
            self.commit_bars_mmap();
        } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
            // A dword write inside the BAR registers (0x10..=0x24): mirror
            // the new address into our shadow BAR configuration.
            let bar_idx = (start as u32 - 0x10) / 4;
            let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
            let val = u32::from_le_bytes(value);
            let mut modify = false;
            for region in self.mmio_regions.iter_mut() {
                if region.bar_index() == bar_idx as usize {
                    let old_addr = region.address();
                    // Low 4 bits of a BAR hold flags, not address bits.
                    let new_addr = val & 0xFFFFFFF0;
                    if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change 32bit bar address
                        *region = region.set_address(u64::from(new_addr));
                        modify = true;
                    } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change 64bit bar low address
                        *region =
                            region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
                        modify = true;
                    }
                    break;
                } else if region.is_64bit_memory()
                    && ((bar_idx % 2) == 1)
                    && (region.bar_index() + 1 == bar_idx as usize)
                {
                    // Change 64bit bar high address
                    let old_addr = region.address();
                    if val != (old_addr >> 32) as u32 {
                        let mut new_addr = (u64::from(val)) << 32;
                        new_addr |= old_addr & 0xFFFFFFFF;
                        *region = region.set_address(new_addr);
                        modify = true;
                    }
                    break;
                }
            }
            if modify {
                // if bar is changed under memory enabled, mmap the
                // new bar immediately.
                let cmd = self.config.read_config::<u8>(PCI_COMMAND);
                if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
                    self.commit_bars_mmap();
                }
            }
        }
    }
1955 
read_virtual_config_register(&self, reg_idx: usize) -> u321956     fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
1957         if reg_idx == PCI_VCFG_NOTY {
1958             let mut q = self.acpi_notifier_val.lock();
1959             let mut val = 0;
1960             if !q.is_empty() {
1961                 val = q.remove(0);
1962             }
1963             drop(q);
1964             return val;
1965         }
1966 
1967         warn!(
1968             "{} read unsupported vcfg register {}",
1969             self.debug_label(),
1970             reg_idx
1971         );
1972         0xFFFF_FFFF
1973     }
1974 
    /// Handles writes to crosvm-specific virtual config registers:
    /// - PCI_VCFG_PM: value 0 puts the device into a low-power state (with a
    ///   wakeup event when one is available), any other value brings it back.
    /// - PCI_VCFG_DSM: evaluates a host ACPI _DSM call with arguments passed
    ///   via the vcfg shared-memory page, writing the result back in place.
    fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
        match reg_idx {
            PCI_VCFG_PM => {
                match value {
                    0 => {
                        // Prefer the wakeup-capable variant so the host can
                        // signal resume through the cloned PM event.
                        if let Some(pm_evt) =
                            self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
                        {
                            *self.is_in_low_power.lock() = true;
                            let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
                        } else {
                            let _ = self.device.pm_low_power_enter();
                        }
                    }
                    _ => {
                        *self.is_in_low_power.lock() = false;
                        let _ = self.device.pm_low_power_exit();
                    }
                };
            }
            PCI_VCFG_DSM => {
                if let Some(shm) = &self.vcfg_shm_mmap {
                    // The guest stages the DSM arguments in the shared page.
                    let mut args = [0u8; 4096];
                    if let Err(e) = shm.read_slice(&mut args, 0) {
                        error!("failed to read DSM Args: {}", e);
                        return;
                    }
                    let res = match self.device.acpi_dsm(&args) {
                        Ok(r) => r,
                        Err(e) => {
                            error!("failed to call DSM: {}", e);
                            return;
                        }
                    };
                    // Publish the result back through the same shared page.
                    if let Err(e) = shm.write_slice(&res, 0) {
                        error!("failed to write DSM result: {}", e);
                        return;
                    }
                    if let Err(e) = shm.msync() {
                        error!("failed to msync: {}", e)
                    }
                }
            }
            _ => warn!(
                "{} write unsupported vcfg register {}",
                self.debug_label(),
                reg_idx
            ),
        };
    }
2025 
read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8])2026     fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
2027         if let Some(msix_cap) = &self.msix_cap {
2028             let msix_cap = msix_cap.lock();
2029             if msix_cap.is_msix_table(bar_index, offset) {
2030                 msix_cap.read_table(offset, data);
2031                 return;
2032             } else if msix_cap.is_msix_pba(bar_index, offset) {
2033                 msix_cap.read_pba(offset, data);
2034                 return;
2035             }
2036         }
2037         self.device.region_read(bar_index, data, offset);
2038     }
2039 
write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8])2040     fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
2041         // Ignore igd opregion's write
2042         if let Some(device_data) = &self.device_data {
2043             match *device_data {
2044                 DeviceData::IntelGfxData { opregion_index } => {
2045                     if opregion_index == bar_index as u32 {
2046                         return;
2047                     }
2048                 }
2049             }
2050         }
2051 
2052         if let Some(msix_cap) = &self.msix_cap {
2053             let mut msix_cap = msix_cap.lock();
2054             if msix_cap.is_msix_table(bar_index, offset) {
2055                 let behavior = msix_cap.write_table(offset, data);
2056                 if let MsixStatus::EntryChanged(index) = behavior {
2057                     let irqfd = msix_cap.get_msix_irqfd(index);
2058                     self.msix_vector_update(index, irqfd);
2059                 }
2060                 return;
2061             } else if msix_cap.is_msix_pba(bar_index, offset) {
2062                 msix_cap.write_pba(offset, data);
2063                 return;
2064             }
2065         }
2066 
2067         self.device.region_write(bar_index, data, offset);
2068     }
2069 
    /// Shuts this device down by delegating to `close()`.
    fn destroy_device(&mut self) {
        self.close();
    }
2073 
    /// Generates guest ACPI AML for this device: a vcfg register region,
    /// virtual power-resource (_PRx) methods, and — when the host device has
    /// an ACPI companion node — a _DSM passthrough method. Also returns the
    /// shared-memory mapping used to exchange _DSM arguments/results.
    fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
        let mut amls = Vec::new();
        let mut shm = None;
        if let Some(pci_address) = self.pci_address {
            let vcfg_offset = pci_address.to_config_address(0, 13);
            if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
                vcfg_register.to_aml_bytes(&mut amls);
                // NOTE(review): create_shm_mmap() is invoked twice, yielding
                // two separate MemoryMapping objects (one returned to the
                // caller, one kept in self.vcfg_shm_mmap) — presumably both
                // view the same shared memory; confirm this is intentional.
                shm = vcfg_register
                    .create_shm_mmap()
                    .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
                self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
                // All vfio-pci devices should have virtual _PRx method, otherwise
                // host couldn't know whether device has enter into suspend state,
                // host would always think it is in active state, so its parent PCIe
                // switch couldn't enter into suspend state.
                PowerResourceMethod {}.to_aml_bytes(&mut amls);
                // TODO: WIP: Ideally, we should generate DSM only if the physical
                // device has a _DSM; however, such information is not provided by
                // Linux. As a temporary workaround, we check whether there is an
                // associated ACPI companion device node and skip generating guest
                // _DSM if there is none.
                let acpi_path = self.sysfs_path.join("firmware_node/path");
                if acpi_path.exists() {
                    DsmMethod {}.to_aml_bytes(&mut amls);
                }
            }
        }

        (amls, shm)
    }
2104 
set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32>2105     fn set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32> {
2106         if let Some(gpe_nr) = resources.allocate_gpe() {
2107             base::debug!("set_gpe: gpe-nr {} addr {:?}", gpe_nr, self.pci_address);
2108             self.gpe = Some(gpe_nr);
2109         }
2110         self.gpe
2111     }
2112 }
2113 
impl Suspendable for VfioPciDevice {
    /// Stops the worker thread and reclaims the device state it owned so the
    /// device is quiescent for suspend.
    fn sleep(&mut self) -> anyhow::Result<()> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let res = worker_thread.stop();
            // Move the state the worker held back into the device.
            self.pci_address = Some(res.address);
            self.sysfs_path = res.sysfs_path;
            self.pm_cap = res.pm_cap;
            self.msix_cap = res.msix_cap;
            self.vm_socket_vm = Some(res.vm_socket);
        }
        Ok(())
    }

    /// Restarts the worker thread — but only if the device had been activated
    /// before sleeping.
    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            self.start_work_thread();
        }
        Ok(())
    }
}
2134 
#[cfg(test)]
mod tests {
    //! Tests for `VfioResourceAllocator::allocate_at_can_overlap`, covering
    //! how a requested range is carved out of the free-region list when it
    //! misses, fully covers, or partially covers existing free regions.

    use resources::AddressRange;

    use super::VfioResourceAllocator;

    #[test]
    fn no_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // Requests entirely outside the managed range leave it untouched.
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
            .unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
    }

    #[test]
    fn complete_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // A request covering a whole free region removes it entirely.
        // regions [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_one() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // A request overlapping the tail of one free region trims it.
        // regions [32, 39], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_two() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // A request spanning two free regions trims the tail of the first and
        // the head of the second.
        // regions [32, 39], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
    }

    #[test]
    fn partial_overlap_three() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 39], [48, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
            .unwrap();
        // regions [32, 39], [48, 63], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
            .unwrap();
        // A request spanning three free regions consumes the middle one and
        // trims the two at the edges.
        // regions [32, 35], [76, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
    }
}
2236