• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::cmp::max;
6 use std::cmp::Reverse;
7 use std::collections::BTreeMap;
8 use std::collections::BTreeSet;
9 use std::fs;
10 use std::path::Path;
11 use std::path::PathBuf;
12 use std::str::FromStr;
13 use std::sync::Arc;
14 
15 use acpi_tables::aml::Aml;
16 use base::debug;
17 use base::error;
18 use base::pagesize;
19 use base::warn;
20 use base::AsRawDescriptor;
21 use base::AsRawDescriptors;
22 use base::Event;
23 use base::EventToken;
24 use base::MemoryMapping;
25 use base::Protection;
26 use base::RawDescriptor;
27 use base::Tube;
28 use base::WaitContext;
29 use base::WorkerThread;
30 use hypervisor::MemCacheType;
31 use resources::AddressRange;
32 use resources::Alloc;
33 use resources::AllocOptions;
34 use resources::MmioType;
35 use resources::SystemAllocator;
36 use sync::Mutex;
37 use vfio_sys::vfio::VFIO_PCI_ACPI_NTFY_IRQ_INDEX;
38 use vfio_sys::*;
39 use vm_control::api::VmMemoryClient;
40 use vm_control::HotPlugDeviceInfo;
41 use vm_control::HotPlugDeviceType;
42 use vm_control::VmMemoryDestination;
43 use vm_control::VmMemoryRegionId;
44 use vm_control::VmMemorySource;
45 use vm_control::VmRequest;
46 use vm_control::VmResponse;
47 
48 use crate::pci::acpi::DeviceVcfgRegister;
49 use crate::pci::acpi::DsmMethod;
50 use crate::pci::acpi::PowerResourceMethod;
51 use crate::pci::acpi::SHM_OFFSET;
52 use crate::pci::msi::MsiConfig;
53 use crate::pci::msi::MsiStatus;
54 use crate::pci::msi::PCI_MSI_FLAGS;
55 use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
56 use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
57 use crate::pci::msi::PCI_MSI_NEXT_POINTER;
58 use crate::pci::msix::MsixConfig;
59 use crate::pci::msix::MsixStatus;
60 use crate::pci::msix::BITS_PER_PBA_ENTRY;
61 use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
62 use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
63 use crate::pci::pci_device::BarRange;
64 use crate::pci::pci_device::Error as PciDeviceError;
65 use crate::pci::pci_device::PciDevice;
66 use crate::pci::pci_device::PreferredIrq;
67 use crate::pci::pm::PciPmCap;
68 use crate::pci::pm::PmConfig;
69 use crate::pci::pm::PM_CAP_LENGTH;
70 use crate::pci::PciAddress;
71 use crate::pci::PciBarConfiguration;
72 use crate::pci::PciBarIndex;
73 use crate::pci::PciBarPrefetchable;
74 use crate::pci::PciBarRegionType;
75 use crate::pci::PciCapabilityID;
76 use crate::pci::PciClassCode;
77 use crate::pci::PciId;
78 use crate::pci::PciInterruptPin;
79 use crate::pci::PCI_VCFG_DSM;
80 use crate::pci::PCI_VCFG_NOTY;
81 use crate::pci::PCI_VCFG_PM;
82 use crate::pci::PCI_VENDOR_ID_INTEL;
83 use crate::vfio::VfioDevice;
84 use crate::vfio::VfioError;
85 use crate::vfio::VfioIrqType;
86 use crate::vfio::VfioPciConfig;
87 use crate::IrqLevelEvent;
88 use crate::Suspendable;
89 
// Offsets of registers within the standard PCI configuration header.
const PCI_VENDOR_ID: u32 = 0x0;
const PCI_DEVICE_ID: u32 = 0x2;
const PCI_COMMAND: u32 = 0x4;
const PCI_COMMAND_MEMORY: u8 = 0x2;
const PCI_BASE_CLASS_CODE: u32 = 0x0B;
const PCI_INTERRUPT_NUM: u32 = 0x3C;
const PCI_INTERRUPT_PIN: u32 = 0x3D;

// Capability-list head pointer and the capability IDs handled by this device.
const PCI_CAPABILITY_LIST: u32 = 0x34;
const PCI_CAP_ID_MSI: u8 = 0x05;
const PCI_CAP_ID_MSIX: u8 = 0x11;
const PCI_CAP_ID_PM: u8 = 0x01;

// Size of the standard PCI config space.
const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
// Size of the standard PCIe config space: 4KB.
const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;

// PCI Express Extended Capability IDs.
const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;
113 
114 struct VfioPmCap {
115     offset: u32,
116     capabilities: u32,
117     config: PmConfig,
118 }
119 
120 impl VfioPmCap {
new(config: &VfioPciConfig, cap_start: u32) -> Self121     fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
122         let mut capabilities: u32 = config.read_config(cap_start);
123         capabilities |= (PciPmCap::default_cap() as u32) << 16;
124         VfioPmCap {
125             offset: cap_start,
126             capabilities,
127             config: PmConfig::new(false),
128         }
129     }
130 
should_trigger_pme(&mut self) -> bool131     pub fn should_trigger_pme(&mut self) -> bool {
132         self.config.should_trigger_pme()
133     }
134 
is_pm_reg(&self, offset: u32) -> bool135     fn is_pm_reg(&self, offset: u32) -> bool {
136         (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
137     }
138 
read(&self, offset: u32) -> u32139     pub fn read(&self, offset: u32) -> u32 {
140         let offset = offset - self.offset;
141         if offset == 0 {
142             self.capabilities
143         } else {
144             let mut data = 0;
145             self.config.read(&mut data);
146             data
147         }
148     }
149 
write(&mut self, offset: u64, data: &[u8])150     pub fn write(&mut self, offset: u64, data: &[u8]) {
151         let offset = offset - self.offset as u64;
152         if offset >= std::mem::size_of::<u32>() as u64 {
153             let offset = offset - std::mem::size_of::<u32>() as u64;
154             self.config.write(offset, data);
155         }
156     }
157 }
158 
/// Outcome of a guest write to an MSI/MSI-X capability that the caller must
/// apply to the VFIO device's interrupt setup.
enum VfioMsiChange {
    // Interrupts were switched off by the write.
    Disable,
    // Interrupts were switched on by the write.
    Enable,
    // Interrupt state stayed enabled but its configuration changed.
    FunctionChanged,
}
164 
165 struct VfioMsiCap {
166     config: MsiConfig,
167     offset: u32,
168 }
169 
170 impl VfioMsiCap {
new( config: &VfioPciConfig, msi_cap_start: u32, vm_socket_irq: Tube, device_id: u32, device_name: String, ) -> Self171     fn new(
172         config: &VfioPciConfig,
173         msi_cap_start: u32,
174         vm_socket_irq: Tube,
175         device_id: u32,
176         device_name: String,
177     ) -> Self {
178         let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
179         let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
180         let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;
181 
182         VfioMsiCap {
183             config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
184             offset: msi_cap_start,
185         }
186     }
187 
is_msi_reg(&self, index: u64, len: usize) -> bool188     fn is_msi_reg(&self, index: u64, len: usize) -> bool {
189         self.config.is_msi_reg(self.offset, index, len)
190     }
191 
write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange>192     fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
193         let offset = index as u32 - self.offset;
194         match self.config.write_msi_capability(offset, data) {
195             MsiStatus::Enabled => Some(VfioMsiChange::Enable),
196             MsiStatus::Disabled => Some(VfioMsiChange::Disable),
197             MsiStatus::NothingToDo => None,
198         }
199     }
200 
get_msi_irqfd(&self) -> Option<&Event>201     fn get_msi_irqfd(&self) -> Option<&Event> {
202         self.config.get_irqfd()
203     }
204 
destroy(&mut self)205     fn destroy(&mut self) {
206         self.config.destroy()
207     }
208 }
209 
// Register layout of the MSI-X capability structure.
const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control word
const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size mask within Message Control
const PCI_MSIX_TABLE: u32 = 0x04; // Table offset/BIR dword
const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index bits of the table dword
const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Table offset bits within the BAR
const PCI_MSIX_PBA: u32 = 0x08; // Pending Bit Array offset/BIR dword
const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index bits of the PBA dword
const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // PBA offset bits within the BAR
219 
220 struct VfioMsixCap {
221     config: MsixConfig,
222     offset: u32,
223     table_size: u16,
224     table_pci_bar: PciBarIndex,
225     table_offset: u64,
226     table_size_bytes: u64,
227     pba_pci_bar: PciBarIndex,
228     pba_offset: u64,
229     pba_size_bytes: u64,
230     msix_interrupt_evt: Vec<Event>,
231 }
232 
233 impl VfioMsixCap {
new( config: &VfioPciConfig, msix_cap_start: u32, vm_socket_irq: Tube, pci_id: u32, device_name: String, ) -> Self234     fn new(
235         config: &VfioPciConfig,
236         msix_cap_start: u32,
237         vm_socket_irq: Tube,
238         pci_id: u32,
239         device_name: String,
240     ) -> Self {
241         let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
242         let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
243         let table_pci_bar = (table & PCI_MSIX_TABLE_BIR) as PciBarIndex;
244         let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
245         let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
246         let pba_pci_bar = (pba & PCI_MSIX_PBA_BIR) as PciBarIndex;
247         let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;
248 
249         let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
250         if table_pci_bar == pba_pci_bar
251             && pba_offset > table_offset
252             && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
253         {
254             table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
255         }
256 
257         let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
258         let pba_size_bytes =
259             table_size.div_ceil(BITS_PER_PBA_ENTRY as u64) * MSIX_PBA_ENTRIES_MODULO;
260         let mut msix_interrupt_evt = Vec::new();
261         for _ in 0..table_size {
262             msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
263         }
264         VfioMsixCap {
265             config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
266             offset: msix_cap_start,
267             table_size: table_size as u16,
268             table_pci_bar,
269             table_offset,
270             table_size_bytes,
271             pba_pci_bar,
272             pba_offset,
273             pba_size_bytes,
274             msix_interrupt_evt,
275         }
276     }
277 
278     // only msix control register is writable and need special handle in pci r/w
is_msix_control_reg(&self, offset: u32, size: u32) -> bool279     fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
280         let control_start = self.offset + PCI_MSIX_FLAGS;
281         let control_end = control_start + 2;
282 
283         offset < control_end && offset + size > control_start
284     }
285 
read_msix_control(&self, data: &mut u32)286     fn read_msix_control(&self, data: &mut u32) {
287         *data = self.config.read_msix_capability(*data);
288     }
289 
write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange>290     fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
291         let old_enabled = self.config.enabled();
292         let old_masked = self.config.masked();
293 
294         self.config
295             .write_msix_capability(PCI_MSIX_FLAGS.into(), data);
296 
297         let new_enabled = self.config.enabled();
298         let new_masked = self.config.masked();
299 
300         if !old_enabled && new_enabled {
301             Some(VfioMsiChange::Enable)
302         } else if old_enabled && !new_enabled {
303             Some(VfioMsiChange::Disable)
304         } else if new_enabled && old_masked != new_masked {
305             Some(VfioMsiChange::FunctionChanged)
306         } else {
307             None
308         }
309     }
310 
is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool311     fn is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool {
312         bar_index == self.table_pci_bar
313             && offset >= self.table_offset
314             && offset < self.table_offset + self.table_size_bytes
315     }
316 
get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange>317     fn get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
318         if bar_index == self.table_pci_bar {
319             AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
320         } else {
321             None
322         }
323     }
324 
read_table(&self, offset: u64, data: &mut [u8])325     fn read_table(&self, offset: u64, data: &mut [u8]) {
326         let offset = offset - self.table_offset;
327         self.config.read_msix_table(offset, data);
328     }
329 
write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus330     fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
331         let offset = offset - self.table_offset;
332         self.config.write_msix_table(offset, data)
333     }
334 
is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool335     fn is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool {
336         bar_index == self.pba_pci_bar
337             && offset >= self.pba_offset
338             && offset < self.pba_offset + self.pba_size_bytes
339     }
340 
get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange>341     fn get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
342         if bar_index == self.pba_pci_bar {
343             AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
344         } else {
345             None
346         }
347     }
348 
read_pba(&self, offset: u64, data: &mut [u8])349     fn read_pba(&self, offset: u64, data: &mut [u8]) {
350         let offset = offset - self.pba_offset;
351         self.config.read_pba_entries(offset, data);
352     }
353 
write_pba(&mut self, offset: u64, data: &[u8])354     fn write_pba(&mut self, offset: u64, data: &[u8]) {
355         let offset = offset - self.pba_offset;
356         self.config.write_pba_entries(offset, data);
357     }
358 
get_msix_irqfd(&self, index: usize) -> Option<&Event>359     fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
360         let irqfd = self.config.get_irqfd(index);
361         if let Some(fd) = irqfd {
362             if self.msix_vector_masked(index) {
363                 Some(&self.msix_interrupt_evt[index])
364             } else {
365                 Some(fd)
366             }
367         } else {
368             None
369         }
370     }
371 
get_msix_irqfds(&self) -> Vec<Option<&Event>>372     fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
373         let mut irqfds = Vec::new();
374 
375         for i in 0..self.table_size {
376             irqfds.push(self.get_msix_irqfd(i as usize));
377         }
378 
379         irqfds
380     }
381 
table_size(&self) -> usize382     fn table_size(&self) -> usize {
383         self.table_size.into()
384     }
385 
clone_msix_evt(&self) -> Vec<Event>386     fn clone_msix_evt(&self) -> Vec<Event> {
387         self.msix_interrupt_evt
388             .iter()
389             .map(|irq| irq.try_clone().unwrap())
390             .collect()
391     }
392 
msix_vector_masked(&self, index: usize) -> bool393     fn msix_vector_masked(&self, index: usize) -> bool {
394         !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
395     }
396 
trigger(&mut self, index: usize)397     fn trigger(&mut self, index: usize) {
398         self.config.trigger(index as u16);
399     }
400 
destroy(&mut self)401     fn destroy(&mut self) {
402         self.config.destroy()
403     }
404 }
405 
406 struct VfioResourceAllocator {
407     // The region that is not allocated yet.
408     regions: BTreeSet<AddressRange>,
409 }
410 
411 impl VfioResourceAllocator {
412     // Creates a new `VfioResourceAllocator` for managing VFIO resources.
413     // Can return `Err` if `base` + `size` overflows a u64.
414     //
415     // * `base` - The starting address of the range to manage.
416     // * `size` - The size of the address range in bytes.
new(pool: AddressRange) -> Result<Self, PciDeviceError>417     fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
418         if pool.is_empty() {
419             return Err(PciDeviceError::SizeZero);
420         }
421         let mut regions = BTreeSet::new();
422         regions.insert(pool);
423         Ok(VfioResourceAllocator { regions })
424     }
425 
internal_allocate_from_slot( &mut self, slot: AddressRange, range: AddressRange, ) -> Result<u64, PciDeviceError>426     fn internal_allocate_from_slot(
427         &mut self,
428         slot: AddressRange,
429         range: AddressRange,
430     ) -> Result<u64, PciDeviceError> {
431         let slot_was_present = self.regions.remove(&slot);
432         assert!(slot_was_present);
433 
434         let (before, after) = slot.non_overlapping_ranges(range);
435 
436         if !before.is_empty() {
437             self.regions.insert(before);
438         }
439         if !after.is_empty() {
440             self.regions.insert(after);
441         }
442 
443         Ok(range.start)
444     }
445 
446     // Allocates a range of addresses from the managed region with a minimal alignment.
447     // Overlapping with a previous allocation is _not_ allowed.
448     // Returns allocated address.
allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError>449     fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
450         if size == 0 {
451             return Err(PciDeviceError::SizeZero);
452         }
453         if !alignment.is_power_of_two() {
454             return Err(PciDeviceError::BadAlignment);
455         }
456 
457         // finds first region matching alignment and size.
458         let region = self.regions.iter().find(|range| {
459             match range.start % alignment {
460                 0 => range.start.checked_add(size - 1),
461                 r => range.start.checked_add(size - 1 + alignment - r),
462             }
463             .is_some_and(|end| end <= range.end)
464         });
465 
466         match region {
467             Some(&slot) => {
468                 let start = match slot.start % alignment {
469                     0 => slot.start,
470                     r => slot.start + alignment - r,
471                 };
472                 let end = start + size - 1;
473                 let range = AddressRange::from_start_and_end(start, end);
474 
475                 self.internal_allocate_from_slot(slot, range)
476             }
477             None => Err(PciDeviceError::OutOfSpace),
478         }
479     }
480 
481     // Allocates a range of addresses from the managed region with a required location.
482     // Overlapping with a previous allocation is allowed.
allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError>483     fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
484         if range.is_empty() {
485             return Err(PciDeviceError::SizeZero);
486         }
487 
488         while let Some(&slot) = self
489             .regions
490             .iter()
491             .find(|avail_range| avail_range.overlaps(range))
492         {
493             let _address = self.internal_allocate_from_slot(slot, range)?;
494         }
495         Ok(())
496     }
497 }
498 
499 struct VfioPciWorker {
500     address: PciAddress,
501     sysfs_path: PathBuf,
502     vm_socket: Tube,
503     name: String,
504     pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
505     msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
506 }
507 
508 impl VfioPciWorker {
run( &mut self, req_irq_evt: Event, wakeup_evt: Event, acpi_notify_evt: Event, kill_evt: Event, msix_evt: Vec<Event>, is_in_low_power: Arc<Mutex<bool>>, gpe: Option<u32>, notification_val: Arc<Mutex<Vec<u32>>>, )509     fn run(
510         &mut self,
511         req_irq_evt: Event,
512         wakeup_evt: Event,
513         acpi_notify_evt: Event,
514         kill_evt: Event,
515         msix_evt: Vec<Event>,
516         is_in_low_power: Arc<Mutex<bool>>,
517         gpe: Option<u32>,
518         notification_val: Arc<Mutex<Vec<u32>>>,
519     ) {
520         #[derive(EventToken, Debug)]
521         enum Token {
522             ReqIrq,
523             WakeUp,
524             AcpiNotifyEvent,
525             Kill,
526             MsixIrqi { index: usize },
527         }
528 
529         let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
530             (&req_irq_evt, Token::ReqIrq),
531             (&wakeup_evt, Token::WakeUp),
532             (&acpi_notify_evt, Token::AcpiNotifyEvent),
533             (&kill_evt, Token::Kill),
534         ]) {
535             Ok(pc) => pc,
536             Err(e) => {
537                 error!(
538                     "{} failed creating vfio WaitContext: {}",
539                     self.name.clone(),
540                     e
541                 );
542                 return;
543             }
544         };
545 
546         for (index, msix_int) in msix_evt.iter().enumerate() {
547             wait_ctx
548                 .add(msix_int, Token::MsixIrqi { index })
549                 .expect("Failed to create vfio WaitContext for msix interrupt event")
550         }
551 
552         'wait: loop {
553             let events = match wait_ctx.wait() {
554                 Ok(v) => v,
555                 Err(e) => {
556                     error!("{} failed polling vfio events: {}", self.name.clone(), e);
557                     break;
558                 }
559             };
560 
561             for event in events.iter().filter(|e| e.is_readable) {
562                 match event.token {
563                     Token::MsixIrqi { index } => {
564                         if let Some(msix_cap) = &self.msix_cap {
565                             msix_cap.lock().trigger(index);
566                         }
567                     }
568                     Token::ReqIrq => {
569                         let device = HotPlugDeviceInfo {
570                             device_type: HotPlugDeviceType::EndPoint,
571                             path: self.sysfs_path.clone(),
572                             hp_interrupt: false,
573                         };
574 
575                         let request = VmRequest::HotPlugVfioCommand { device, add: false };
576                         if self.vm_socket.send(&request).is_ok() {
577                             if let Err(e) = self.vm_socket.recv::<VmResponse>() {
578                                 error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
579                             } else {
580                                 break 'wait;
581                             }
582                         }
583                     }
584                     Token::WakeUp => {
585                         let _ = wakeup_evt.wait();
586 
587                         if *is_in_low_power.lock() {
588                             if let Some(pm_cap) = &self.pm_cap {
589                                 if pm_cap.lock().should_trigger_pme() {
590                                     let request =
591                                         VmRequest::PciPme(self.address.pme_requester_id());
592                                     if self.vm_socket.send(&request).is_ok() {
593                                         if let Err(e) = self.vm_socket.recv::<VmResponse>() {
594                                             error!(
595                                                 "{} failed to send PME: {}",
596                                                 self.name.clone(),
597                                                 e
598                                             );
599                                         }
600                                     }
601                                 }
602                             }
603                         }
604                     }
605                     Token::AcpiNotifyEvent => {
606                         if let Some(gpe) = gpe {
607                             if let Ok(val) = base::EventExt::read_count(&acpi_notify_evt) {
608                                 notification_val.lock().push(val as u32);
609                                 let request = VmRequest::Gpe {
610                                     gpe,
611                                     clear_evt: None,
612                                 };
613                                 if self.vm_socket.send(&request).is_ok() {
614                                     if let Err(e) = self.vm_socket.recv::<VmResponse>() {
615                                         error!("{} failed to send GPE: {}", self.name.clone(), e);
616                                     }
617                                 }
618                             } else {
619                                 error!("{} failed to read acpi_notify_evt", self.name.clone());
620                             }
621                         }
622                     }
623                     Token::Kill => break 'wait,
624                 }
625             }
626         }
627     }
628 }
629 
/// Extracts the "next capability offset" field from a PCI Express extended
/// capability header (bits [31:20]); the low two bits are masked off because
/// capability offsets are dword-aligned.
fn get_next_from_extcap_header(header: u32) -> u32 {
    (header >> 20) & 0xffc
}
633 
is_skipped_ext_cap(cap_id: u16) -> bool634 fn is_skipped_ext_cap(cap_id: u16) -> bool {
635     matches!(
636         cap_id,
637         // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
638         PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
639     )
640 }
641 
/// Vendor-specific quirk data for devices needing extra handling.
enum DeviceData {
    /// Intel graphics device; `opregion_index` tracks the registered OpRegion
    /// (initialized to `u32::MAX` before one is assigned).
    IntelGfxData { opregion_index: u32 },
}
645 
/// Location and exposure info for one PCI Express extended capability.
#[derive(Copy, Clone)]
struct ExtCap {
    /// Offset of the capability header in configuration space.
    offset: u32,
    /// Size of the capability in bytes.
    size: u32,
    /// Next-capability offset presented to the guest; for a non-skipped
    /// capability this points at the next non-skipped one.
    next: u16,
    /// When true, this capability is hidden from the guest.
    is_skipped: bool,
}
658 
659 /// Implements the Vfio Pci device, then a pci device is added into vm
660 pub struct VfioPciDevice {
661     device: Arc<VfioDevice>,
662     config: VfioPciConfig,
663     hotplug: bool,
664     hotplug_bus_number: Option<u8>,
665     preferred_address: PciAddress,
666     pci_address: Option<PciAddress>,
667     interrupt_evt: Option<IrqLevelEvent>,
668     acpi_notification_evt: Option<Event>,
669     mmio_regions: Vec<PciBarConfiguration>,
670     io_regions: Vec<PciBarConfiguration>,
671     pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
672     msi_cap: Option<VfioMsiCap>,
673     msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
674     irq_type: Option<VfioIrqType>,
675     vm_memory_client: VmMemoryClient,
676     device_data: Option<DeviceData>,
677     pm_evt: Option<Event>,
678     is_in_low_power: Arc<Mutex<bool>>,
679     worker_thread: Option<WorkerThread<VfioPciWorker>>,
680     vm_socket_vm: Option<Tube>,
681     sysfs_path: PathBuf,
682     // PCI Express Extended Capabilities
683     ext_caps: Vec<ExtCap>,
684     vcfg_shm_mmap: Option<MemoryMapping>,
685     mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<VmMemoryRegionId>)>,
686     activated: bool,
687     acpi_notifier_val: Arc<Mutex<Vec<u32>>>,
688     gpe: Option<u32>,
689     base_class_code: PciClassCode,
690 }
691 
692 impl VfioPciDevice {
693     /// Constructs a new Vfio Pci device for the give Vfio device
new( sysfs_path: &Path, device: VfioDevice, hotplug: bool, hotplug_bus_number: Option<u8>, guest_address: Option<PciAddress>, vfio_device_socket_msi: Tube, vfio_device_socket_msix: Tube, vm_memory_client: VmMemoryClient, vfio_device_socket_vm: Tube, ) -> Result<Self, PciDeviceError>694     pub fn new(
695         sysfs_path: &Path,
696         device: VfioDevice,
697         hotplug: bool,
698         hotplug_bus_number: Option<u8>,
699         guest_address: Option<PciAddress>,
700         vfio_device_socket_msi: Tube,
701         vfio_device_socket_msix: Tube,
702         vm_memory_client: VmMemoryClient,
703         vfio_device_socket_vm: Tube,
704     ) -> Result<Self, PciDeviceError> {
705         let preferred_address = if let Some(bus_num) = hotplug_bus_number {
706             debug!("hotplug bus {}", bus_num);
707             PciAddress {
708                 // Caller specify pcie bus number for hotplug device
709                 bus: bus_num,
710                 // devfn should be 0, otherwise pcie root port couldn't detect it
711                 dev: 0,
712                 func: 0,
713             }
714         } else if let Some(guest_address) = guest_address {
715             debug!("guest PCI address {}", guest_address);
716             guest_address
717         } else {
718             let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
719                 PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
720             })?;
721             debug!("parsed device PCI address {}", addr);
722             addr
723         };
724 
725         let dev = Arc::new(device);
726         let config = VfioPciConfig::new(Arc::clone(&dev));
727         let mut msi_socket = Some(vfio_device_socket_msi);
728         let mut msix_socket = Some(vfio_device_socket_msix);
729         let mut msi_cap: Option<VfioMsiCap> = None;
730         let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
731         let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;
732 
733         let mut is_pcie = false;
734         let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
735         let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
736         let device_id: u16 = config.read_config(PCI_DEVICE_ID);
737         let base_class_code = PciClassCode::try_from(config.read_config::<u8>(PCI_BASE_CLASS_CODE))
738             .unwrap_or(PciClassCode::Other);
739 
740         let pci_id = PciId::new(vendor_id, device_id);
741 
742         while cap_next != 0 {
743             let cap_id: u8 = config.read_config(cap_next);
744             if cap_id == PCI_CAP_ID_PM {
745                 pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
746             } else if cap_id == PCI_CAP_ID_MSI {
747                 if let Some(msi_socket) = msi_socket.take() {
748                     msi_cap = Some(VfioMsiCap::new(
749                         &config,
750                         cap_next,
751                         msi_socket,
752                         pci_id.into(),
753                         dev.device_name().to_string(),
754                     ));
755                 }
756             } else if cap_id == PCI_CAP_ID_MSIX {
757                 if let Some(msix_socket) = msix_socket.take() {
758                     msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
759                         &config,
760                         cap_next,
761                         msix_socket,
762                         pci_id.into(),
763                         dev.device_name().to_string(),
764                     ))));
765                 }
766             } else if cap_id == PciCapabilityID::PciExpress as u8 {
767                 is_pcie = true;
768             }
769             let offset = cap_next + PCI_MSI_NEXT_POINTER;
770             cap_next = config.read_config::<u8>(offset).into();
771         }
772 
773         let mut ext_caps: Vec<ExtCap> = Vec::new();
774         if is_pcie {
775             let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
776             while ext_cap_next != 0 {
777                 let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
778                 if ext_cap_config == 0 {
779                     break;
780                 }
781                 ext_caps.push(ExtCap {
782                     offset: ext_cap_next,
783                     // Calculate the size later
784                     size: 0,
785                     // init as the real value
786                     next: get_next_from_extcap_header(ext_cap_config) as u16,
787                     is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
788                 });
789                 ext_cap_next = get_next_from_extcap_header(ext_cap_config);
790             }
791 
792             // Manage extended caps
793             //
794             // Extended capabilities are chained with each pointing to the next, so
795             // we can drop anything other than the head of the chain simply by
796             // modifying the previous next pointer. For the head of the chain, we
797             // can modify the capability ID to something that cannot match a valid
798             // capability. ID PCI_EXT_CAP_ID_CAC is for this since it is no longer
799             // supported.
800             //
801             // reverse order by offset
802             ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
803             let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
804             let mut non_skipped_next: u16 = 0;
805             for ext_cap in ext_caps.iter_mut() {
806                 if !ext_cap.is_skipped {
807                     ext_cap.next = non_skipped_next;
808                     non_skipped_next = ext_cap.offset as u16;
809                 } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
810                     ext_cap.next = non_skipped_next;
811                 }
812                 ext_cap.size = next_offset - ext_cap.offset;
813                 next_offset = ext_cap.offset;
814             }
815             // order by offset
816             ext_caps.reverse();
817         }
818 
819         let is_intel_gfx =
820             base_class_code == PciClassCode::DisplayController && vendor_id == PCI_VENDOR_ID_INTEL;
821         let device_data = if is_intel_gfx {
822             Some(DeviceData::IntelGfxData {
823                 opregion_index: u32::MAX,
824             })
825         } else {
826             None
827         };
828 
829         Ok(VfioPciDevice {
830             device: dev,
831             config,
832             hotplug,
833             hotplug_bus_number,
834             preferred_address,
835             pci_address: None,
836             interrupt_evt: None,
837             acpi_notification_evt: None,
838             mmio_regions: Vec::new(),
839             io_regions: Vec::new(),
840             pm_cap,
841             msi_cap,
842             msix_cap,
843             irq_type: None,
844             vm_memory_client,
845             device_data,
846             pm_evt: None,
847             is_in_low_power: Arc::new(Mutex::new(false)),
848             worker_thread: None,
849             vm_socket_vm: Some(vfio_device_socket_vm),
850             sysfs_path: sysfs_path.to_path_buf(),
851             ext_caps,
852             vcfg_shm_mmap: None,
853             mapped_mmio_bars: BTreeMap::new(),
854             activated: false,
855             acpi_notifier_val: Arc::new(Mutex::new(Vec::new())),
856             gpe: None,
857             base_class_code,
858         })
859     }
860 
    /// Gets the pci address of the device, if one has already been allocated.
    ///
    /// Returns `None` until an address has been assigned (the field starts as
    /// `None` at construction).
    pub fn pci_address(&self) -> Option<PciAddress> {
        self.pci_address
    }
865 
is_gfx(&self) -> bool866     pub fn is_gfx(&self) -> bool {
867         self.base_class_code == PciClassCode::DisplayController
868     }
869 
    /// Returns true when the device data recorded at construction time marks
    /// this as an Intel graphics device.
    fn is_intel_gfx(&self) -> bool {
        matches!(self.device_data, Some(DeviceData::IntelGfxData { .. }))
    }
873 
enable_acpi_notification(&mut self) -> Result<(), PciDeviceError>874     fn enable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
875         if let Some(ref acpi_notification_evt) = self.acpi_notification_evt {
876             return self
877                 .device
878                 .acpi_notification_evt_enable(acpi_notification_evt, VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
879                 .map_err(|_| PciDeviceError::AcpiNotifySetupFailed);
880         }
881         Err(PciDeviceError::AcpiNotifySetupFailed)
882     }
883 
884     #[allow(dead_code)]
disable_acpi_notification(&mut self) -> Result<(), PciDeviceError>885     fn disable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
886         if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
887             return self
888                 .device
889                 .acpi_notification_disable(VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
890                 .map_err(|_| PciDeviceError::AcpiNotifyDeactivationFailed);
891         }
892         Err(PciDeviceError::AcpiNotifyDeactivationFailed)
893     }
894 
895     #[allow(dead_code)]
test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError>896     fn test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError> {
897         if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
898             return self
899                 .device
900                 .acpi_notification_test(VFIO_PCI_ACPI_NTFY_IRQ_INDEX, val)
901                 .map_err(|_| PciDeviceError::AcpiNotifyTestFailed);
902         }
903         Err(PciDeviceError::AcpiNotifyTestFailed)
904     }
905 
    /// Enables legacy INTx interrupt delivery, if an interrupt event exists.
    ///
    /// The call order is deliberate: register the trigger eventfd, mask the
    /// line, hook up the resample eventfd, then unmask. Any failure after the
    /// initial enable rolls back via `disable_intx`. On success `irq_type`
    /// becomes `Some(VfioIrqType::Intx)`; silently does nothing when
    /// `interrupt_evt` is `None`.
    fn enable_intx(&mut self) {
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            if let Err(e) = self.device.irq_enable(
                &[Some(interrupt_evt.get_trigger())],
                VFIO_PCI_INTX_IRQ_INDEX,
                0,
            ) {
                error!("{} Intx enable failed: {}", self.debug_label(), e);
                return;
            }
            // Keep the line masked while the resample eventfd is wired up.
            if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx mask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self
                .device
                .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
            {
                error!("{} resample enable failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx unmask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            self.irq_type = Some(VfioIrqType::Intx);
        }
    }
937 
disable_intx(&mut self)938     fn disable_intx(&mut self) {
939         if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
940             error!("{} Intx disable failed: {}", self.debug_label(), e);
941         }
942         self.irq_type = None;
943     }
944 
disable_irqs(&mut self)945     fn disable_irqs(&mut self) {
946         match self.irq_type {
947             Some(VfioIrqType::Msi) => self.disable_msi(),
948             Some(VfioIrqType::Msix) => self.disable_msix(),
949             _ => (),
950         }
951 
952         // Above disable_msi() or disable_msix() will enable intx again.
953         // so disable_intx here again.
954         if let Some(VfioIrqType::Intx) = self.irq_type {
955             self.disable_intx();
956         }
957     }
958 
enable_msi(&mut self)959     fn enable_msi(&mut self) {
960         self.disable_irqs();
961 
962         let irqfd = match &self.msi_cap {
963             Some(cap) => {
964                 if let Some(fd) = cap.get_msi_irqfd() {
965                     fd
966                 } else {
967                     self.enable_intx();
968                     return;
969                 }
970             }
971             None => {
972                 self.enable_intx();
973                 return;
974             }
975         };
976 
977         if let Err(e) = self
978             .device
979             .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
980         {
981             error!("{} failed to enable msi: {}", self.debug_label(), e);
982             self.enable_intx();
983             return;
984         }
985 
986         self.irq_type = Some(VfioIrqType::Msi);
987     }
988 
disable_msi(&mut self)989     fn disable_msi(&mut self) {
990         if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
991             error!("{} failed to disable msi: {}", self.debug_label(), e);
992             return;
993         }
994         self.irq_type = None;
995 
996         self.enable_intx();
997     }
998 
enable_msix(&mut self)999     fn enable_msix(&mut self) {
1000         if self.msix_cap.is_none() {
1001             return;
1002         }
1003 
1004         self.disable_irqs();
1005         let cap = self.msix_cap.as_ref().unwrap().lock();
1006         let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());
1007 
1008         let mut failed = false;
1009         if !vector_in_use {
1010             // If there are no msix vectors currently in use, we explicitly assign a new eventfd
1011             // to vector 0. Then we enable it and immediately disable it, so that vfio will
1012             // activate physical device. If there are available msix vectors, just enable them
1013             // instead.
1014             let fd = Event::new().expect("failed to create event");
1015             let table_size = cap.table_size();
1016             let mut irqfds = vec![None; table_size];
1017             irqfds[0] = Some(&fd);
1018             for fd in irqfds.iter_mut().skip(1) {
1019                 *fd = None;
1020             }
1021             if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1022                 error!("{} failed to enable msix: {}", self.debug_label(), e);
1023                 failed = true;
1024             }
1025             irqfds[0] = None;
1026             if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1027                 error!("{} failed to enable msix: {}", self.debug_label(), e);
1028                 failed = true;
1029             }
1030         } else {
1031             let result = self
1032                 .device
1033                 .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
1034             if let Err(e) = result {
1035                 error!("{} failed to enable msix: {}", self.debug_label(), e);
1036                 failed = true;
1037             }
1038         }
1039 
1040         std::mem::drop(cap);
1041         if failed {
1042             self.enable_intx();
1043             return;
1044         }
1045         self.irq_type = Some(VfioIrqType::Msix);
1046     }
1047 
disable_msix(&mut self)1048     fn disable_msix(&mut self) {
1049         if self.msix_cap.is_none() {
1050             return;
1051         }
1052         if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
1053             error!("{} failed to disable msix: {}", self.debug_label(), e);
1054             return;
1055         }
1056         self.irq_type = None;
1057         self.enable_intx();
1058     }
1059 
msix_vectors_update(&self) -> Result<(), VfioError>1060     fn msix_vectors_update(&self) -> Result<(), VfioError> {
1061         if let Some(cap) = &self.msix_cap {
1062             self.device
1063                 .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
1064         }
1065         Ok(())
1066     }
1067 
msix_vector_update(&self, index: usize, irqfd: Option<&Event>)1068     fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
1069         if let Err(e) = self
1070             .device
1071             .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
1072         {
1073             error!(
1074                 "{} failed to update msix vector {}: {}",
1075                 self.debug_label(),
1076                 index,
1077                 e
1078             );
1079         }
1080     }
1081 
    /// Splits each sparse-mmap area of a BAR so that the ranges listed in
    /// `remove_mmaps` are carved out and will not be directly mapped.
    ///
    /// Each removal range is first clipped to the mmap area it affects, then
    /// widened to page granularity before being punched out of the area via
    /// the `VfioResourceAllocator`. The surviving sub-ranges are returned as
    /// new sparse-mmap areas. On an allocator construction failure an empty
    /// vector is returned, meaning the whole BAR stays trapped.
    fn adjust_bar_mmap(
        &self,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
        remove_mmaps: &[AddressRange],
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
        let pgmask = (pagesize() as u64) - 1;

        for mmap in bar_mmaps.iter() {
            // Skip degenerate (empty/overflowing) areas.
            let mmap_range = if let Some(mmap_range) =
                AddressRange::from_start_and_size(mmap.offset, mmap.size)
            {
                mmap_range
            } else {
                continue;
            };
            let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
                Ok(a) => a,
                Err(e) => {
                    error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    mmaps.clear();
                    return mmaps;
                }
            };

            for &(mut remove_range) in remove_mmaps.iter() {
                // Only the part of the removal range inside this area matters.
                remove_range = remove_range.intersect(mmap_range);
                if !remove_range.is_empty() {
                    // align offsets to page size
                    let begin = remove_range.start & !pgmask;
                    let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
                    let remove_range = AddressRange::from_start_and_end(begin, end);
                    // Mark the page-aligned hole as "allocated" so it is
                    // excluded from the free regions collected below.
                    if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
                        error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    }
                }
            }

            // Whatever is still free in the allocator becomes a mappable area.
            for mmap in to_mmap.regions {
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: mmap.start,
                    size: mmap.end - mmap.start + 1,
                });
            }
        }

        mmaps
    }
1130 
remove_bar_mmap_msix( &self, bar_index: PciBarIndex, bar_mmaps: Vec<vfio_region_sparse_mmap_area>, ) -> Vec<vfio_region_sparse_mmap_area>1131     fn remove_bar_mmap_msix(
1132         &self,
1133         bar_index: PciBarIndex,
1134         bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1135     ) -> Vec<vfio_region_sparse_mmap_area> {
1136         let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
1137         let mut msix_regions = Vec::new();
1138 
1139         if let Some(t) = msix_cap.get_msix_table(bar_index) {
1140             msix_regions.push(t);
1141         }
1142         if let Some(p) = msix_cap.get_msix_pba(bar_index) {
1143             msix_regions.push(p);
1144         }
1145 
1146         if msix_regions.is_empty() {
1147             return bar_mmaps;
1148         }
1149 
1150         self.adjust_bar_mmap(bar_mmaps, &msix_regions)
1151     }
1152 
    /// Maps the mmap-able portions of a BAR into guest physical memory at
    /// `bar_addr`, returning the ids of the registered memory regions.
    ///
    /// BARs whose region lacks `VFIO_REGION_INFO_FLAG_MMAP` are left fully
    /// trapped (empty result). MSI-X table/PBA pages are excluded from the
    /// mapping so they remain emulated. A failure to clone the device fd or to
    /// register a region stops the loop early; already-registered regions are
    /// still returned so the caller can track (and later unregister) them.
    fn add_bar_mmap(&self, index: PciBarIndex, bar_addr: u64) -> Vec<VmMemoryRegionId> {
        let mut mmaps_ids: Vec<VmMemoryRegionId> = Vec::new();
        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
            // the bar storing msix table and pba couldn't mmap.
            // these bars should be trapped, so that msix could be emulated.
            let mut mmaps = self.device.get_region_mmap(index);

            if self.msix_cap.is_some() {
                mmaps = self.remove_bar_mmap_msix(index, mmaps);
            }
            if mmaps.is_empty() {
                return mmaps_ids;
            }

            for mmap in mmaps.iter() {
                let mmap_offset = mmap.offset;
                let mmap_size = mmap.size;
                // Guest address of this sub-range within the BAR.
                let guest_map_start = bar_addr + mmap_offset;
                // Offset of the sub-range within the vfio device file.
                let region_offset = self.device.get_region_offset(index);
                let offset = region_offset + mmap_offset;
                let descriptor = match self.device.device_file().try_clone() {
                    Ok(device_file) => device_file.into(),
                    Err(_) => break,
                };
                match self.vm_memory_client.register_memory(
                    VmMemorySource::Descriptor {
                        descriptor,
                        offset,
                        size: mmap_size,
                    },
                    VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
                    Protection::read_write(),
                    MemCacheType::CacheCoherent,
                ) {
                    Ok(id) => {
                        mmaps_ids.push(id);
                    }
                    Err(e) => {
                        error!("register_memory failed: {}", e);
                        break;
                    }
                }
            }
        }

        mmaps_ids
    }
1200 
remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId])1201     fn remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId]) {
1202         for mmap_id in mmap_ids {
1203             if let Err(e) = self.vm_memory_client.unregister_memory(*mmap_id) {
1204                 error!("unregister_memory failed: {}", e);
1205             }
1206         }
1207     }
1208 
disable_bars_mmap(&mut self)1209     fn disable_bars_mmap(&mut self) {
1210         for (_, (_, mmap_ids)) in self.mapped_mmio_bars.iter() {
1211             self.remove_bar_mmap(mmap_ids);
1212         }
1213         self.mapped_mmio_bars.clear();
1214     }
1215 
commit_bars_mmap(&mut self)1216     fn commit_bars_mmap(&mut self) {
1217         // Unmap all bars before remapping bars, to prevent issues with overlap
1218         let mut needs_map = Vec::new();
1219         for mmio_info in self.mmio_regions.iter() {
1220             let bar_idx = mmio_info.bar_index();
1221             let addr = mmio_info.address();
1222 
1223             if let Some((cur_addr, ids)) = self.mapped_mmio_bars.remove(&bar_idx) {
1224                 if cur_addr == addr {
1225                     self.mapped_mmio_bars.insert(bar_idx, (cur_addr, ids));
1226                     continue;
1227                 } else {
1228                     self.remove_bar_mmap(&ids);
1229                 }
1230             }
1231 
1232             if addr != 0 {
1233                 needs_map.push((bar_idx, addr));
1234             }
1235         }
1236 
1237         for (bar_idx, addr) in needs_map.iter() {
1238             let ids = self.add_bar_mmap(*bar_idx, *addr);
1239             self.mapped_mmio_bars.insert(*bar_idx, (*addr, ids));
1240         }
1241     }
1242 
close(&mut self)1243     fn close(&mut self) {
1244         if let Some(msi) = self.msi_cap.as_mut() {
1245             msi.destroy();
1246         }
1247         if let Some(msix) = &self.msix_cap {
1248             msix.lock().destroy();
1249         }
1250         self.disable_bars_mmap();
1251         self.device.close();
1252     }
1253 
    /// Spawns the `VfioPciWorker` thread that services device requests, PM
    /// events, ACPI notifications and MSI-X events.
    ///
    /// Consumes `vm_socket_vm` (so a second call is a no-op) and bails out
    /// early — without spawning — if any eventfd setup or the req-irq enable
    /// fails. On success the write ends of the PM and ACPI events are stored
    /// on `self` and `activated` is set.
    fn start_work_thread(&mut self) {
        // Taking the socket makes this method effectively run-once.
        let vm_socket = match self.vm_socket_vm.take() {
            Some(socket) => socket,
            None => return,
        };

        // Eventfd signalled by vfio when the host requests the device back.
        let req_evt = match Event::new() {
            Ok(evt) => {
                if let Err(e) = self
                    .device
                    .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
                {
                    error!("{} enable req_irq failed: {}", self.debug_label(), e);
                    return;
                }
                evt
            }
            Err(_) => return,
        };

        // Clone pair: one end kept on self, the other given to the worker.
        let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{} failed creating PM Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };
        self.pm_evt = Some(self_pm_evt);

        // Same pattern for the ACPI notification event.
        let (self_acpi_notify_evt, acpi_notify_evt) =
            match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
                Ok(v) => v,
                Err(e) => {
                    error!(
                        "{} failed creating ACPI Event pair: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
        self.acpi_notification_evt = Some(self_acpi_notify_evt);

        // Failure here is logged but does not prevent the worker from running.
        if let Err(e) = self.enable_acpi_notification() {
            error!("{}: {}", self.debug_label(), e);
        }

        let mut msix_evt = Vec::new();
        if let Some(msix_cap) = &self.msix_cap {
            msix_evt = msix_cap.lock().clone_msix_evt();
        }

        // Snapshot everything the worker needs before moving into the closure.
        let name = self.device.device_name().to_string();
        let address = self.pci_address.expect("Unassigned PCI Address.");
        let sysfs_path = self.sysfs_path.clone();
        let pm_cap = self.pm_cap.clone();
        let msix_cap = self.msix_cap.clone();
        let is_in_low_power = self.is_in_low_power.clone();
        let gpe_nr = self.gpe;
        let notification_val = self.acpi_notifier_val.clone();
        self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
            let mut worker = VfioPciWorker {
                address,
                sysfs_path,
                vm_socket,
                name,
                pm_cap,
                msix_cap,
            };
            worker.run(
                req_evt,
                pm_evt,
                acpi_notify_evt,
                kill_evt,
                msix_evt,
                is_in_low_power,
                gpe_nr,
                notification_val,
            );
            worker
        }));
        self.activated = true;
    }
1341 
    /// Probes all BARs (and the expansion ROM) of the device and returns the
    /// memory BARs; I/O BARs are appended to `self.io_regions` instead.
    ///
    /// Uses the standard PCI sizing sequence: write all-ones to the BAR
    /// register, read it back, and derive the size from the masked value
    /// (`size = !mask + 1`). 64-bit memory BARs consume two registers, so the
    /// index advances by two for those.
    fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
        let mut i = VFIO_PCI_BAR0_REGION_INDEX;
        let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();

        while i <= VFIO_PCI_ROM_REGION_INDEX {
            let mut low: u32 = 0xffffffff;
            // BAR registers start at config offset 0x10; the ROM BAR is at 0x30.
            let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
                0x30
            } else {
                0x10 + i * 4
            };
            self.config.write_config(low, offset);
            low = self.config.read_config(offset);

            // Low nibble holds the BAR type flags: bit 0 = I/O space,
            // bit 2 = 64-bit, bit 3 = prefetchable.
            let low_flag = low & 0xf;
            let is_64bit = low_flag & 0x4 == 0x4;
            if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
                // Memory BAR (or ROM). For 64-bit BARs also size the upper half.
                let mut upper: u32 = 0xffffffff;
                if is_64bit {
                    self.config.write_config(upper, offset + 4);
                    upper = self.config.read_config(offset + 4);
                }

                low &= 0xffff_fff0;
                let mut size: u64 = u64::from(upper);
                size <<= 32;
                size |= u64::from(low);
                // Two's complement of the address mask gives the BAR size.
                size = !size + 1;
                let region_type = if is_64bit {
                    PciBarRegionType::Memory64BitRegion
                } else {
                    PciBarRegionType::Memory32BitRegion
                };
                let prefetch = if low_flag & 0x8 == 0x8 {
                    PciBarPrefetchable::Prefetchable
                } else {
                    PciBarPrefetchable::NotPrefetchable
                };
                mem_bars.push(PciBarConfiguration::new(
                    i as usize,
                    size,
                    region_type,
                    prefetch,
                ));
            } else if low_flag & 0x1 == 0x1 {
                // I/O BAR: only the upper 30 bits participate in sizing.
                let size = !(low & 0xffff_fffc) + 1;
                self.io_regions.push(PciBarConfiguration::new(
                    i as usize,
                    size.into(),
                    PciBarRegionType::IoRegion,
                    PciBarPrefetchable::NotPrefetchable,
                ));
            }

            // A 64-bit BAR occupies two consecutive BAR slots.
            if is_64bit {
                i += 2;
            } else {
                i += 1;
            }
        }
        mem_bars
    }
1404 
configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64)1405     fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
1406         let offset: u32 = bar_info.reg_index() as u32 * 4;
1407         let mmio_region = *bar_info;
1408         self.mmio_regions.push(mmio_region.set_address(bar_addr));
1409 
1410         let val: u32 = self.config.read_config(offset);
1411         let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
1412         self.config.write_config(low, offset);
1413         if bar_info.is_64bit_memory() {
1414             let upper = (bar_addr >> 32) as u32;
1415             self.config.write_config(upper, offset + 4);
1416         }
1417     }
1418 
allocate_root_barmem( &mut self, mem_bars: &[PciBarConfiguration], resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1419     fn allocate_root_barmem(
1420         &mut self,
1421         mem_bars: &[PciBarConfiguration],
1422         resources: &mut SystemAllocator,
1423     ) -> Result<Vec<BarRange>, PciDeviceError> {
1424         let address = self.pci_address.unwrap();
1425         let mut ranges: Vec<BarRange> = Vec::new();
1426         for mem_bar in mem_bars {
1427             let bar_size = mem_bar.size();
1428             let mut bar_addr: u64 = 0;
1429             // Don't allocate mmio for hotplug device, OS will allocate it from
1430             // its parent's bridge window.
1431             if !self.hotplug {
1432                 bar_addr = resources
1433                     .allocate_mmio(
1434                         bar_size,
1435                         Alloc::PciBar {
1436                             bus: address.bus,
1437                             dev: address.dev,
1438                             func: address.func,
1439                             bar: mem_bar.bar_index() as u8,
1440                         },
1441                         "vfio_bar".to_string(),
1442                         AllocOptions::new()
1443                             .prefetchable(mem_bar.is_prefetchable())
1444                             .max_address(if mem_bar.is_64bit_memory() {
1445                                 u64::MAX
1446                             } else {
1447                                 u32::MAX.into()
1448                             })
1449                             .align(bar_size),
1450                     )
1451                     .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
1452                 ranges.push(BarRange {
1453                     addr: bar_addr,
1454                     size: bar_size,
1455                     prefetchable: mem_bar.is_prefetchable(),
1456                 });
1457             }
1458             self.configure_barmem(mem_bar, bar_addr);
1459         }
1460         Ok(ranges)
1461     }
1462 
allocate_nonroot_barmem( &mut self, mem_bars: &mut [PciBarConfiguration], resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1463     fn allocate_nonroot_barmem(
1464         &mut self,
1465         mem_bars: &mut [PciBarConfiguration],
1466         resources: &mut SystemAllocator,
1467     ) -> Result<Vec<BarRange>, PciDeviceError> {
1468         const NON_PREFETCHABLE: usize = 0;
1469         const PREFETCHABLE: usize = 1;
1470         const ARRAY_SIZE: usize = 2;
1471         let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1472         let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1473             match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1474                 Ok(a) => a,
1475                 Err(e) => {
1476                     error!(
1477                         "{} init nonroot VfioResourceAllocator failed: {}",
1478                         self.debug_label(),
1479                         e
1480                     );
1481                     return Err(e);
1482                 }
1483             },
1484             match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1485                 Ok(a) => a,
1486                 Err(e) => {
1487                     error!(
1488                         "{} init nonroot VfioResourceAllocator failed: {}",
1489                         self.debug_label(),
1490                         e
1491                     );
1492                     return Err(e);
1493                 }
1494             },
1495         ];
1496         let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1497         // the window must be 1M-aligned as per the PCI spec
1498         let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
1499         let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];
1500 
1501         // Descend by bar size, this could reduce allocated size for all the bars.
1502         mem_bars.sort_by_key(|a| Reverse(a.size()));
1503         for mem_bar in mem_bars {
1504             let prefetchable = mem_bar.is_prefetchable();
1505             let is_64bit = mem_bar.is_64bit_memory();
1506 
1507             // if one prefetchable bar is 32bit, all the prefetchable bars should be in Low MMIO,
1508             // as all the prefetchable bars should be in one region
1509             if prefetchable && !is_64bit {
1510                 memtype[PREFETCHABLE] = MmioType::Low;
1511             }
1512             let i = if prefetchable {
1513                 PREFETCHABLE
1514             } else {
1515                 NON_PREFETCHABLE
1516             };
1517             let bar_size = mem_bar.size();
1518             let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1519                 Ok(s) => s,
1520                 Err(e) => {
1521                     error!(
1522                         "{} nonroot allocate_wit_align failed: {}",
1523                         self.debug_label(),
1524                         e
1525                     );
1526                     return Err(e);
1527                 }
1528             };
1529             window_sz[i] = max(window_sz[i], start + bar_size);
1530             alignment[i] = max(alignment[i], bar_size);
1531             let mem_info = (*mem_bar).set_address(start);
1532             membars[i].push(mem_info);
1533         }
1534 
1535         let address = self.pci_address.unwrap();
1536         let mut ranges: Vec<BarRange> = Vec::new();
1537         for (index, bars) in membars.iter().enumerate() {
1538             if bars.is_empty() {
1539                 continue;
1540             }
1541 
1542             let i = if index == 1 {
1543                 PREFETCHABLE
1544             } else {
1545                 NON_PREFETCHABLE
1546             };
1547             let mut window_addr: u64 = 0;
1548             // Don't allocate mmio for hotplug device, OS will allocate it from
1549             // its parent's bridge window.
1550             if !self.hotplug {
1551                 window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
1552                 let alloc = if i == NON_PREFETCHABLE {
1553                     Alloc::PciBridgeWindow {
1554                         bus: address.bus,
1555                         dev: address.dev,
1556                         func: address.func,
1557                     }
1558                 } else {
1559                     Alloc::PciBridgePrefetchWindow {
1560                         bus: address.bus,
1561                         dev: address.dev,
1562                         func: address.func,
1563                     }
1564                 };
1565                 window_addr = resources
1566                     .mmio_allocator(memtype[i])
1567                     .allocate_with_align(
1568                         window_sz[i],
1569                         alloc,
1570                         "vfio_bar_window".to_string(),
1571                         alignment[i],
1572                     )
1573                     .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
1574                 for mem_info in bars {
1575                     let bar_addr = window_addr + mem_info.address();
1576                     ranges.push(BarRange {
1577                         addr: bar_addr,
1578                         size: mem_info.size(),
1579                         prefetchable: mem_info.is_prefetchable(),
1580                     });
1581                 }
1582             }
1583 
1584             for mem_info in bars {
1585                 let bar_addr = window_addr + mem_info.address();
1586                 self.configure_barmem(mem_info, bar_addr);
1587             }
1588         }
1589         Ok(ranges)
1590     }
1591 
    /// Returns the maximum IO virtual address (IOVA) usable by this VFIO PCI
    /// device, as reported by the underlying VFIO device handle.
    pub fn get_max_iova(&self) -> u64 {
        self.device.get_max_addr()
    }
1596 
get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap>1597     fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
1598         self.ext_caps
1599             .iter()
1600             .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
1601             .cloned()
1602     }
1603 
is_skipped_reg(&self, reg: u32) -> bool1604     fn is_skipped_reg(&self, reg: u32) -> bool {
1605         // fast handle for pci config space
1606         if reg < PCI_CONFIG_SPACE_SIZE {
1607             return false;
1608         }
1609 
1610         self.get_ext_cap_by_reg(reg)
1611             .is_some_and(|cap| cap.is_skipped)
1612     }
1613 }
1614 
1615 impl PciDevice for VfioPciDevice {
debug_label(&self) -> String1616     fn debug_label(&self) -> String {
1617         format!("vfio {} device", self.device.device_name())
1618     }
1619 
preferred_address(&self) -> Option<PciAddress>1620     fn preferred_address(&self) -> Option<PciAddress> {
1621         Some(self.preferred_address)
1622     }
1623 
allocate_address( &mut self, resources: &mut SystemAllocator, ) -> Result<PciAddress, PciDeviceError>1624     fn allocate_address(
1625         &mut self,
1626         resources: &mut SystemAllocator,
1627     ) -> Result<PciAddress, PciDeviceError> {
1628         if self.pci_address.is_none() {
1629             let mut address = self.preferred_address;
1630             while address.func < 8 {
1631                 if resources.reserve_pci(address, self.debug_label()) {
1632                     self.pci_address = Some(address);
1633                     break;
1634                 } else if self.hotplug_bus_number.is_none() {
1635                     break;
1636                 } else {
1637                     address.func += 1;
1638                 }
1639             }
1640         }
1641         self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1642     }
1643 
keep_rds(&self) -> Vec<RawDescriptor>1644     fn keep_rds(&self) -> Vec<RawDescriptor> {
1645         let mut rds = self.device.keep_rds();
1646         if let Some(ref interrupt_evt) = self.interrupt_evt {
1647             rds.extend(interrupt_evt.as_raw_descriptors());
1648         }
1649         rds.push(self.vm_memory_client.as_raw_descriptor());
1650         if let Some(vm_socket_vm) = &self.vm_socket_vm {
1651             rds.push(vm_socket_vm.as_raw_descriptor());
1652         }
1653         if let Some(msi_cap) = &self.msi_cap {
1654             rds.push(msi_cap.config.get_msi_socket());
1655         }
1656         if let Some(msix_cap) = &self.msix_cap {
1657             rds.push(msix_cap.lock().config.as_raw_descriptor());
1658         }
1659         rds
1660     }
1661 
preferred_irq(&self) -> PreferredIrq1662     fn preferred_irq(&self) -> PreferredIrq {
1663         // Is INTx configured?
1664         let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
1665             1 => PciInterruptPin::IntA,
1666             2 => PciInterruptPin::IntB,
1667             3 => PciInterruptPin::IntC,
1668             4 => PciInterruptPin::IntD,
1669             _ => return PreferredIrq::None,
1670         };
1671 
1672         // TODO: replace sysfs/irq value parsing with vfio interface
1673         //       reporting host allocated interrupt number and type.
1674         let path = self.sysfs_path.join("irq");
1675         let gsi = fs::read_to_string(path)
1676             .map(|v| v.trim().parse::<u32>().unwrap_or(0))
1677             .unwrap_or(0);
1678 
1679         PreferredIrq::Fixed { pin, gsi }
1680     }
1681 
assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32)1682     fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
1683         // Keep event/resample event references.
1684         self.interrupt_evt = Some(irq_evt);
1685 
1686         // enable INTX
1687         self.enable_intx();
1688 
1689         self.config
1690             .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
1691         self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
1692     }
1693 
allocate_io_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1694     fn allocate_io_bars(
1695         &mut self,
1696         resources: &mut SystemAllocator,
1697     ) -> Result<Vec<BarRange>, PciDeviceError> {
1698         let address = self
1699             .pci_address
1700             .expect("allocate_address must be called prior to allocate_device_bars");
1701 
1702         let mut mem_bars = self.collect_bars();
1703 
1704         let ranges = if address.bus == 0 {
1705             self.allocate_root_barmem(&mem_bars, resources)?
1706         } else {
1707             self.allocate_nonroot_barmem(&mut mem_bars, resources)?
1708         };
1709 
1710         // Quirk, enable igd memory for guest vga arbitrate, otherwise kernel vga arbitrate
1711         // driver doesn't claim this vga device, then xorg couldn't boot up.
1712         if self.is_intel_gfx() {
1713             let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
1714             cmd |= PCI_COMMAND_MEMORY;
1715             self.config.write_config(cmd, PCI_COMMAND);
1716         }
1717         Ok(ranges)
1718     }
1719 
allocate_device_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1720     fn allocate_device_bars(
1721         &mut self,
1722         resources: &mut SystemAllocator,
1723     ) -> Result<Vec<BarRange>, PciDeviceError> {
1724         let mut ranges: Vec<BarRange> = Vec::new();
1725 
1726         if !self.is_intel_gfx() {
1727             return Ok(ranges);
1728         }
1729 
1730         // Make intel gfx's opregion as mmio bar, and allocate a gpa for it
1731         // then write this gpa into pci cfg register
1732         if let Some((index, size)) = self.device.get_cap_type_info(
1733             VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
1734             VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
1735         ) {
1736             let address = self
1737                 .pci_address
1738                 .expect("allocate_address must be called prior to allocate_device_bars");
1739             let bar_addr = resources
1740                 .allocate_mmio(
1741                     size,
1742                     Alloc::PciBar {
1743                         bus: address.bus,
1744                         dev: address.dev,
1745                         func: address.func,
1746                         bar: (index * 4) as u8,
1747                     },
1748                     "vfio_bar".to_string(),
1749                     AllocOptions::new().max_address(u32::MAX.into()),
1750                 )
1751                 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1752             ranges.push(BarRange {
1753                 addr: bar_addr,
1754                 size,
1755                 prefetchable: false,
1756             });
1757             self.device_data = Some(DeviceData::IntelGfxData {
1758                 opregion_index: index,
1759             });
1760 
1761             self.mmio_regions.push(
1762                 PciBarConfiguration::new(
1763                     index as usize,
1764                     size,
1765                     PciBarRegionType::Memory32BitRegion,
1766                     PciBarPrefetchable::NotPrefetchable,
1767                 )
1768                 .set_address(bar_addr),
1769             );
1770             self.config.write_config(bar_addr as u32, 0xFC);
1771         }
1772 
1773         Ok(ranges)
1774     }
1775 
get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration>1776     fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1777         for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
1778             if region.bar_index() == bar_num {
1779                 let command: u8 = self.config.read_config(PCI_COMMAND);
1780                 if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
1781                     return None;
1782                 } else {
1783                     return Some(*region);
1784                 }
1785             }
1786         }
1787 
1788         None
1789     }
1790 
register_device_capabilities(&mut self) -> Result<(), PciDeviceError>1791     fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
1792         Ok(())
1793     }
1794 
read_config_register(&self, reg_idx: usize) -> u321795     fn read_config_register(&self, reg_idx: usize) -> u32 {
1796         let reg: u32 = (reg_idx * 4) as u32;
1797         let mut config: u32 = self.config.read_config(reg);
1798 
1799         // See VfioPciDevice::new for details how extended caps are managed
1800         if reg >= PCI_CONFIG_SPACE_SIZE {
1801             let ext_cap = self.get_ext_cap_by_reg(reg);
1802             if let Some(ext_cap) = ext_cap {
1803                 if ext_cap.offset == reg {
1804                     config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
1805                 }
1806 
1807                 if ext_cap.is_skipped {
1808                     if reg == PCI_CONFIG_SPACE_SIZE {
1809                         config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
1810                     } else {
1811                         config = 0;
1812                     }
1813                 }
1814             }
1815         }
1816 
1817         // Ignore IO bar
1818         if (0x10..=0x24).contains(&reg) {
1819             let bar_idx = (reg as usize - 0x10) / 4;
1820             if let Some(bar) = self.get_bar_configuration(bar_idx) {
1821                 if bar.is_io() {
1822                     config = 0;
1823                 }
1824             }
1825         } else if let Some(msix_cap) = &self.msix_cap {
1826             let msix_cap = msix_cap.lock();
1827             if msix_cap.is_msix_control_reg(reg, 4) {
1828                 msix_cap.read_msix_control(&mut config);
1829             }
1830         } else if let Some(pm_cap) = &self.pm_cap {
1831             let pm_cap = pm_cap.lock();
1832             if pm_cap.is_pm_reg(reg) {
1833                 config = pm_cap.read(reg);
1834             }
1835         }
1836 
1837         // Quirk for intel graphic, set stolen memory size to 0 in pci_cfg[0x51]
1838         if self.is_intel_gfx() && reg == 0x50 {
1839             config &= 0xffff00ff;
1840         }
1841 
1842         config
1843     }
1844 
write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8])1845     fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
1846         // When guest write config register at the first time, start worker thread
1847         if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
1848             self.start_work_thread();
1849         };
1850 
1851         let start = (reg_idx * 4) as u64 + offset;
1852 
1853         if let Some(pm_cap) = self.pm_cap.as_mut() {
1854             let mut pm_cap = pm_cap.lock();
1855             if pm_cap.is_pm_reg(start as u32) {
1856                 pm_cap.write(start, data);
1857             }
1858         }
1859 
1860         let mut msi_change: Option<VfioMsiChange> = None;
1861         if let Some(msi_cap) = self.msi_cap.as_mut() {
1862             if msi_cap.is_msi_reg(start, data.len()) {
1863                 msi_change = msi_cap.write_msi_reg(start, data);
1864             }
1865         }
1866 
1867         match msi_change {
1868             Some(VfioMsiChange::Enable) => self.enable_msi(),
1869             Some(VfioMsiChange::Disable) => self.disable_msi(),
1870             _ => (),
1871         }
1872 
1873         msi_change = None;
1874         if let Some(msix_cap) = &self.msix_cap {
1875             let mut msix_cap = msix_cap.lock();
1876             if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
1877                 msi_change = msix_cap.write_msix_control(data);
1878             }
1879         }
1880 
1881         match msi_change {
1882             Some(VfioMsiChange::Enable) => self.enable_msix(),
1883             Some(VfioMsiChange::Disable) => self.disable_msix(),
1884             Some(VfioMsiChange::FunctionChanged) => {
1885                 if let Err(e) = self.msix_vectors_update() {
1886                     error!("update msix vectors failed: {}", e);
1887                 }
1888             }
1889             _ => (),
1890         }
1891 
1892         if !self.is_skipped_reg(start as u32) {
1893             self.device
1894                 .region_write(VFIO_PCI_CONFIG_REGION_INDEX as usize, data, start);
1895         }
1896 
1897         // if guest enable memory access, then enable bar mappable once
1898         if start == PCI_COMMAND as u64
1899             && data.len() == 2
1900             && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
1901         {
1902             self.commit_bars_mmap();
1903         } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
1904             let bar_idx = (start as u32 - 0x10) / 4;
1905             let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
1906             let val = u32::from_le_bytes(value);
1907             let mut modify = false;
1908             for region in self.mmio_regions.iter_mut() {
1909                 if region.bar_index() == bar_idx as usize {
1910                     let old_addr = region.address();
1911                     let new_addr = val & 0xFFFFFFF0;
1912                     if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
1913                         // Change 32bit bar address
1914                         *region = region.set_address(u64::from(new_addr));
1915                         modify = true;
1916                     } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
1917                         // Change 64bit bar low address
1918                         *region =
1919                             region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
1920                         modify = true;
1921                     }
1922                     break;
1923                 } else if region.is_64bit_memory()
1924                     && ((bar_idx % 2) == 1)
1925                     && (region.bar_index() + 1 == bar_idx as usize)
1926                 {
1927                     // Change 64bit bar high address
1928                     let old_addr = region.address();
1929                     if val != (old_addr >> 32) as u32 {
1930                         let mut new_addr = (u64::from(val)) << 32;
1931                         new_addr |= old_addr & 0xFFFFFFFF;
1932                         *region = region.set_address(new_addr);
1933                         modify = true;
1934                     }
1935                     break;
1936                 }
1937             }
1938             if modify {
1939                 // if bar is changed under memory enabled, mmap the
1940                 // new bar immediately.
1941                 let cmd = self.config.read_config::<u8>(PCI_COMMAND);
1942                 if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
1943                     self.commit_bars_mmap();
1944                 }
1945             }
1946         }
1947     }
1948 
read_virtual_config_register(&self, reg_idx: usize) -> u321949     fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
1950         if reg_idx == PCI_VCFG_NOTY {
1951             let mut q = self.acpi_notifier_val.lock();
1952             let mut val = 0;
1953             if !q.is_empty() {
1954                 val = q.remove(0);
1955             }
1956             drop(q);
1957             return val;
1958         }
1959 
1960         warn!(
1961             "{} read unsupported vcfg register {}",
1962             self.debug_label(),
1963             reg_idx
1964         );
1965         0xFFFF_FFFF
1966     }
1967 
write_virtual_config_register(&mut self, reg_idx: usize, value: u32)1968     fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
1969         match reg_idx {
1970             PCI_VCFG_PM => {
1971                 match value {
1972                     0 => {
1973                         if let Some(pm_evt) =
1974                             self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
1975                         {
1976                             *self.is_in_low_power.lock() = true;
1977                             let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
1978                         } else {
1979                             let _ = self.device.pm_low_power_enter();
1980                         }
1981                     }
1982                     _ => {
1983                         *self.is_in_low_power.lock() = false;
1984                         let _ = self.device.pm_low_power_exit();
1985                     }
1986                 };
1987             }
1988             PCI_VCFG_DSM => {
1989                 if let Some(shm) = &self.vcfg_shm_mmap {
1990                     let mut args = [0u8; 4096];
1991                     if let Err(e) = shm.read_slice(&mut args, 0) {
1992                         error!("failed to read DSM Args: {}", e);
1993                         return;
1994                     }
1995                     let res = match self.device.acpi_dsm(&args) {
1996                         Ok(r) => r,
1997                         Err(e) => {
1998                             error!("failed to call DSM: {}", e);
1999                             return;
2000                         }
2001                     };
2002                     if let Err(e) = shm.write_slice(&res, 0) {
2003                         error!("failed to write DSM result: {}", e);
2004                         return;
2005                     }
2006                     if let Err(e) = shm.msync() {
2007                         error!("failed to msync: {}", e)
2008                     }
2009                 }
2010             }
2011             _ => warn!(
2012                 "{} write unsupported vcfg register {}",
2013                 self.debug_label(),
2014                 reg_idx
2015             ),
2016         };
2017     }
2018 
read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8])2019     fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
2020         if let Some(msix_cap) = &self.msix_cap {
2021             let msix_cap = msix_cap.lock();
2022             if msix_cap.is_msix_table(bar_index, offset) {
2023                 msix_cap.read_table(offset, data);
2024                 return;
2025             } else if msix_cap.is_msix_pba(bar_index, offset) {
2026                 msix_cap.read_pba(offset, data);
2027                 return;
2028             }
2029         }
2030         self.device.region_read(bar_index, data, offset);
2031     }
2032 
write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8])2033     fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
2034         // Ignore igd opregion's write
2035         if let Some(device_data) = &self.device_data {
2036             match *device_data {
2037                 DeviceData::IntelGfxData { opregion_index } => {
2038                     if opregion_index == bar_index as u32 {
2039                         return;
2040                     }
2041                 }
2042             }
2043         }
2044 
2045         if let Some(msix_cap) = &self.msix_cap {
2046             let mut msix_cap = msix_cap.lock();
2047             if msix_cap.is_msix_table(bar_index, offset) {
2048                 let behavior = msix_cap.write_table(offset, data);
2049                 if let MsixStatus::EntryChanged(index) = behavior {
2050                     let irqfd = msix_cap.get_msix_irqfd(index);
2051                     self.msix_vector_update(index, irqfd);
2052                 }
2053                 return;
2054             } else if msix_cap.is_msix_pba(bar_index, offset) {
2055                 msix_cap.write_pba(offset, data);
2056                 return;
2057             }
2058         }
2059 
2060         self.device.region_write(bar_index, data, offset);
2061     }
2062 
destroy_device(&mut self)2063     fn destroy_device(&mut self) {
2064         self.close();
2065     }
2066 
generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>)2067     fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
2068         let mut amls = Vec::new();
2069         let mut shm = None;
2070         if let Some(pci_address) = self.pci_address {
2071             let vcfg_offset = pci_address.to_config_address(0, 13);
2072             if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
2073                 vcfg_register.to_aml_bytes(&mut amls);
2074                 shm = vcfg_register
2075                     .create_shm_mmap()
2076                     .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
2077                 self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
2078                 // All vfio-pci devices should have virtual _PRx method, otherwise
2079                 // host couldn't know whether device has enter into suspend state,
2080                 // host would always think it is in active state, so its parent PCIe
2081                 // switch couldn't enter into suspend state.
2082                 PowerResourceMethod {}.to_aml_bytes(&mut amls);
2083                 // TODO: WIP: Ideally, we should generate DSM only if the physical
2084                 // device has a _DSM; however, such information is not provided by
2085                 // Linux. As a temporary workaround, we chech whether there is an
2086                 // associated ACPI companion device node and skip generating guest
2087                 // _DSM if there is none.
2088                 let acpi_path = self.sysfs_path.join("firmware_node/path");
2089                 if acpi_path.exists() {
2090                     DsmMethod {}.to_aml_bytes(&mut amls);
2091                 }
2092             }
2093         }
2094 
2095         (amls, shm)
2096     }
2097 
set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32>2098     fn set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32> {
2099         if let Some(gpe_nr) = resources.allocate_gpe() {
2100             base::debug!("set_gpe: gpe-nr {} addr {:?}", gpe_nr, self.pci_address);
2101             self.gpe = Some(gpe_nr);
2102         }
2103         self.gpe
2104     }
2105 }
2106 
2107 impl Suspendable for VfioPciDevice {
sleep(&mut self) -> anyhow::Result<()>2108     fn sleep(&mut self) -> anyhow::Result<()> {
2109         if let Some(worker_thread) = self.worker_thread.take() {
2110             let res = worker_thread.stop();
2111             self.pci_address = Some(res.address);
2112             self.sysfs_path = res.sysfs_path;
2113             self.pm_cap = res.pm_cap;
2114             self.msix_cap = res.msix_cap;
2115             self.vm_socket_vm = Some(res.vm_socket);
2116         }
2117         Ok(())
2118     }
2119 
wake(&mut self) -> anyhow::Result<()>2120     fn wake(&mut self) -> anyhow::Result<()> {
2121         if self.activated {
2122             self.start_work_thread();
2123         }
2124         Ok(())
2125     }
2126 }
2127 
#[cfg(test)]
mod tests {
    use resources::AddressRange;

    use super::VfioResourceAllocator;

    // Requests that fall entirely outside the managed range leave the free
    // region set untouched.
    #[test]
    fn no_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
            .unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
    }

    // A request fully contained in a free region splits it; a request that
    // exactly covers a free region removes it.
    #[test]
    fn complete_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    // A request overlapping the tail of one free region trims that region.
    #[test]
    fn partial_overlap_one() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    // A request spanning the gap between two free regions trims both of them.
    #[test]
    fn partial_overlap_two() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
    }

    // A request spanning three free regions consumes the middle one entirely
    // and trims the outer two.
    #[test]
    fn partial_overlap_three() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 39], [48, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
            .unwrap();
        // regions [32, 39], [48, 63], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
            .unwrap();
        // regions [32, 35], [76, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
    }
}
2229