// Copyright 2019 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::cmp::max;
use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
#[cfg(feature = "direct")]
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::u32;

use acpi_tables::aml::Aml;
#[cfg(feature = "direct")]
use anyhow::Context;
use base::debug;
use base::error;
use base::pagesize;
use base::warn;
use base::AsRawDescriptor;
use base::AsRawDescriptors;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::Protection;
use base::RawDescriptor;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::MemSlot;
use resources::AddressRange;
use resources::Alloc;
use resources::AllocOptions;
use resources::MmioType;
use resources::SystemAllocator;
use sync::Mutex;
use vfio_sys::*;
use vm_control::HotPlugDeviceInfo;
use vm_control::HotPlugDeviceType;
use vm_control::VmMemoryDestination;
use vm_control::VmMemoryRequest;
use vm_control::VmMemoryResponse;
use vm_control::VmMemorySource;
use vm_control::VmRequest;
use vm_control::VmResponse;

use crate::pci::acpi::DeviceVcfgRegister;
use crate::pci::acpi::PowerResourceMethod;
use crate::pci::acpi::SHM_OFFSET;
use crate::pci::msi::MsiConfig;
use crate::pci::msi::MsiStatus;
use crate::pci::msi::PCI_MSI_FLAGS;
use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
use crate::pci::msi::PCI_MSI_NEXT_POINTER;
use crate::pci::msix::MsixConfig;
use crate::pci::msix::MsixStatus;
use crate::pci::msix::BITS_PER_PBA_ENTRY;
use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
#[cfg(feature = "direct")]
use crate::pci::pci_configuration::CLASS_REG;
#[cfg(feature = "direct")]
use crate::pci::pci_configuration::CLASS_REG_REVISION_ID_OFFSET;
#[cfg(feature = "direct")]
use crate::pci::pci_configuration::HEADER_TYPE_REG;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::Error as PciDeviceError;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::PreferredIrq;
use crate::pci::pm::PciPmCap;
use crate::pci::pm::PmConfig;
use crate::pci::pm::PM_CAP_LENGTH;
use crate::pci::PciAddress;
use crate::pci::PciBarConfiguration;
use crate::pci::PciBarIndex;
use crate::pci::PciBarPrefetchable;
use crate::pci::PciBarRegionType;
use crate::pci::PciCapabilityID;
use crate::pci::PciClassCode;
use crate::pci::PciId;
use crate::pci::PciInterruptPin;
use crate::pci::PCI_VENDOR_ID_INTEL;
use crate::vfio::VfioDevice;
use crate::vfio::VfioError;
use crate::vfio::VfioIrqType;
use crate::vfio::VfioPciConfig;
use crate::IrqLevelEvent;
use crate::Suspendable;

const PCI_VENDOR_ID: u32 = 0x0;
const PCI_DEVICE_ID: u32 = 0x2;
const PCI_COMMAND: u32 = 0x4;
const PCI_COMMAND_MEMORY: u8 = 0x2;
const PCI_BASE_CLASS_CODE: u32 = 0x0B;
const PCI_INTERRUPT_NUM: u32 = 0x3C;
const PCI_INTERRUPT_PIN: u32 = 0x3D;

const PCI_CAPABILITY_LIST: u32 = 0x34;
const PCI_CAP_ID_MSI: u8 = 0x05;
const PCI_CAP_ID_MSIX: u8 = 0x11;
const PCI_CAP_ID_PM: u8 = 0x01;

// Size of the standard PCI config space
const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
// Size of the standard PCIe config space: 4KB
const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;

// Extended Capabilities
const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;

#[cfg(feature = "direct")]
const LPSS_MANATEE_OFFSET: u64 = 0x400;
#[cfg(feature = "direct")]
const LPSS_MANATEE_SIZE: u64 = 0x400;

struct VfioPmCap {
    offset: u32,
    capabilities: u32,
    config: PmConfig,
}

impl VfioPmCap {
    fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
        let mut capabilities: u32 = config.read_config(cap_start);
        capabilities |= (PciPmCap::default_cap() as u32) << 16;
        VfioPmCap {
            offset: cap_start,
            capabilities,
            config: PmConfig::new(),
        }
    }

    pub fn should_trigger_pme(&mut self) -> bool {
        self.config.should_trigger_pme()
    }

    fn is_pm_reg(&self, offset: u32) -> bool {
        (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
    }

    pub fn read(&self, offset: u32) -> u32 {
        let offset = offset - self.offset;
        if offset == 0 {
            self.capabilities
        } else {
            let mut data = 0;
            self.config.read(&mut data);
            data
        }
    }

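    // The first dword of the capability is the read-only header and
    // capabilities register; only writes that land past it are forwarded to
    // the emulated power management control/status register.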
    pub fn write(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.offset as u64;
        if offset >= std::mem::size_of::<u32>() as u64 {
            let offset = offset - std::mem::size_of::<u32>() as u64;
            self.config.write(offset, data);
        }
    }
}

enum VfioMsiChange {
    Disable,
    Enable,
    FunctionChanged,
}

struct VfioMsiCap {
    config: MsiConfig,
    offset: u32,
}

impl VfioMsiCap {
    fn new(
        config: &VfioPciConfig,
        msi_cap_start: u32,
        vm_socket_irq: Tube,
        device_id: u32,
        device_name: String,
    ) -> Self {
        let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
        let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
        let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;

        VfioMsiCap {
            config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
            offset: msi_cap_start,
        }
    }

    fn is_msi_reg(&self, index: u64, len: usize) -> bool {
        self.config.is_msi_reg(self.offset, index, len)
    }

    fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
        let offset = index as u32 - self.offset;
        match self.config.write_msi_capability(offset, data) {
            MsiStatus::Enabled => Some(VfioMsiChange::Enable),
            MsiStatus::Disabled => Some(VfioMsiChange::Disable),
            MsiStatus::NothingToDo => None,
        }
    }

    fn get_msi_irqfd(&self) -> Option<&Event> {
        self.config.get_irqfd()
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

// MSI-X registers in MSI-X capability
const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
const PCI_MSIX_PBA: u32 = 0x08; // Pending Bit Array offset
const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR

struct VfioMsixCap {
    config: MsixConfig,
    offset: u32,
    table_size: u16,
    table_pci_bar: u32,
    table_offset: u64,
    table_size_bytes: u64,
    pba_pci_bar: u32,
    pba_offset: u64,
    pba_size_bytes: u64,
    msix_interrupt_evt: Vec<Event>,
}

impl VfioMsixCap {
    fn new(
        config: &VfioPciConfig,
        msix_cap_start: u32,
        vm_socket_irq: Tube,
        pci_id: u32,
        device_name: String,
    ) -> Self {
        let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
        let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
        let table_pci_bar = table & PCI_MSIX_TABLE_BIR;
        let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
        let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
        let pba_pci_bar = pba & PCI_MSIX_PBA_BIR;
        let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;

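        // If the device reports a table that would run into the PBA in the
        // same BAR, clamp the table size so the table ends where the PBA
        // begins.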
        let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
        if table_pci_bar == pba_pci_bar
            && pba_offset > table_offset
            && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
        {
            table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
        }

        let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
        let pba_size_bytes = ((table_size + BITS_PER_PBA_ENTRY as u64 - 1)
            / BITS_PER_PBA_ENTRY as u64)
            * MSIX_PBA_ENTRIES_MODULO;
        let mut msix_interrupt_evt = Vec::new();
        for _ in 0..table_size {
            msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
        }
        VfioMsixCap {
            config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
            offset: msix_cap_start,
            table_size: table_size as u16,
            table_pci_bar,
            table_offset,
            table_size_bytes,
            pba_pci_bar,
            pba_offset,
            pba_size_bytes,
            msix_interrupt_evt,
        }
    }

    // Only the MSI-X control register is writable and needs special handling
    // in PCI config reads/writes.
    fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
        let control_start = self.offset + PCI_MSIX_FLAGS;
        let control_end = control_start + 2;

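        // True when the access [offset, offset + size) overlaps the two-byte
        // Message Control register.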
        offset < control_end && offset + size > control_start
    }

    fn read_msix_control(&self, data: &mut u32) {
        *data = self.config.read_msix_capability(*data);
    }

    fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
        let old_enabled = self.config.enabled();
        let old_masked = self.config.masked();

        self.config
            .write_msix_capability(PCI_MSIX_FLAGS.into(), data);

        let new_enabled = self.config.enabled();
        let new_masked = self.config.masked();

        if !old_enabled && new_enabled {
            Some(VfioMsiChange::Enable)
        } else if old_enabled && !new_enabled {
            Some(VfioMsiChange::Disable)
        } else if new_enabled && old_masked != new_masked {
            Some(VfioMsiChange::FunctionChanged)
        } else {
            None
        }
    }

    fn is_msix_table(&self, bar_index: u32, offset: u64) -> bool {
        bar_index == self.table_pci_bar
            && offset >= self.table_offset
            && offset < self.table_offset + self.table_size_bytes
    }

    fn get_msix_table(&self, bar_index: u32) -> Option<AddressRange> {
        if bar_index == self.table_pci_bar {
            AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
        } else {
            None
        }
    }

    fn read_table(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.table_offset;
        self.config.read_msix_table(offset, data);
    }

    fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
        let offset = offset - self.table_offset;
        self.config.write_msix_table(offset, data)
    }

    fn is_msix_pba(&self, bar_index: u32, offset: u64) -> bool {
        bar_index == self.pba_pci_bar
            && offset >= self.pba_offset
            && offset < self.pba_offset + self.pba_size_bytes
    }

    fn get_msix_pba(&self, bar_index: u32) -> Option<AddressRange> {
        if bar_index == self.pba_pci_bar {
            AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
        } else {
            None
        }
    }

    fn read_pba(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.pba_offset;
        self.config.read_pba_entries(offset, data);
    }

    fn write_pba(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.pba_offset;
        self.config.write_pba_entries(offset, data);
    }

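    // Pick the irqfd to hand to VFIO for a vector. While the vector is
    // masked, substitute the local per-vector event so the worker thread can
    // latch the interrupt as pending instead of injecting it directly.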
    fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
        let irqfd = self.config.get_irqfd(index);
        if let Some(fd) = irqfd {
            if self.msix_vector_masked(index) {
                Some(&self.msix_interrupt_evt[index])
            } else {
                Some(fd)
            }
        } else {
            None
        }
    }

    fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
        let mut irqfds = Vec::new();

        for i in 0..self.table_size {
            irqfds.push(self.get_msix_irqfd(i as usize));
        }

        irqfds
    }

    fn table_size(&self) -> usize {
        self.table_size.into()
    }

    fn clone_msix_evt(&self) -> Vec<Event> {
        self.msix_interrupt_evt
            .iter()
            .map(|irq| irq.try_clone().unwrap())
            .collect()
    }

    fn msix_vector_masked(&self, index: usize) -> bool {
        !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
    }

    fn trigger(&mut self, index: usize) {
        self.config.trigger(index as u16);
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

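// Tracks the free portions of an address range. Unallocated sub-ranges are
// kept in a sorted set; allocations carve pieces out of those sub-ranges.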
struct VfioResourceAllocator {
    // The regions that are not allocated yet.
    regions: BTreeSet<AddressRange>,
}

impl VfioResourceAllocator {
    // Creates a new `VfioResourceAllocator` for managing VFIO resources.
    // Returns an error if `pool` is empty.
    //
    // * `pool` - The address range to manage.
    fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
        if pool.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }
        let mut regions = BTreeSet::new();
        regions.insert(pool);
        Ok(VfioResourceAllocator { regions })
    }

    fn internal_allocate_from_slot(
        &mut self,
        slot: AddressRange,
        range: AddressRange,
    ) -> Result<u64, PciDeviceError> {
        let slot_was_present = self.regions.remove(&slot);
        assert!(slot_was_present);

        let (before, after) = slot.non_overlapping_ranges(range);

        if !before.is_empty() {
            self.regions.insert(before);
        }
        if !after.is_empty() {
            self.regions.insert(after);
        }

        Ok(range.start)
    }

    // Allocates a range of addresses from the managed region with a minimal
    // alignment. Overlapping with a previous allocation is _not_ allowed.
    // Returns the allocated address.
    fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
        if size == 0 {
            return Err(PciDeviceError::SizeZero);
        }
        if !alignment.is_power_of_two() {
            return Err(PciDeviceError::BadAlignment);
        }

        // Find the first free region that can fit `size` bytes at the
        // requested alignment.
        let region = self.regions.iter().find(|range| {
            match range.start % alignment {
                0 => range.start.checked_add(size - 1),
                r => range.start.checked_add(size - 1 + alignment - r),
            }
            .map_or(false, |end| end <= range.end)
        });

        match region {
            Some(&slot) => {
                let start = match slot.start % alignment {
                    0 => slot.start,
                    r => slot.start + alignment - r,
                };
                let end = start + size - 1;
                let range = AddressRange::from_start_and_end(start, end);

                self.internal_allocate_from_slot(slot, range)
            }
            None => Err(PciDeviceError::OutOfSpace),
        }
    }

    // Allocates a range of addresses from the managed region with a required
    // location. Overlapping with a previous allocation is allowed.
    fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
        if range.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }

        while let Some(&slot) = self
            .regions
            .iter()
            .find(|avail_range| avail_range.overlaps(range))
        {
            let _address = self.internal_allocate_from_slot(slot, range)?;
        }
        Ok(())
    }
}

struct VfioPciWorker {
    address: PciAddress,
    sysfs_path: PathBuf,
    vm_socket: Tube,
    name: String,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
}

impl VfioPciWorker {
    fn run(
        &mut self,
        req_irq_evt: Event,
        wakeup_evt: Event,
        kill_evt: Event,
        msix_evt: Vec<Event>,
    ) {
        #[derive(EventToken)]
        enum Token {
            ReqIrq,
            WakeUp,
            Kill,
            MsixIrqi { index: usize },
        }

        let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
            (&req_irq_evt, Token::ReqIrq),
            (&wakeup_evt, Token::WakeUp),
            (&kill_evt, Token::Kill),
        ]) {
            Ok(pc) => pc,
            Err(e) => {
                error!(
                    "{} failed creating vfio WaitContext: {}",
                    self.name.clone(),
                    e
                );
                return;
            }
        };

        for (index, msix_int) in msix_evt.iter().enumerate() {
            wait_ctx
                .add(msix_int, Token::MsixIrqi { index })
                .expect("Failed to create vfio WaitContext for msix interrupt event")
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{} failed polling vfio events: {}", self.name.clone(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::MsixIrqi { index } => {
                        if let Some(msix_cap) = &self.msix_cap {
                            msix_cap.lock().trigger(index);
                        }
                    }
                    Token::ReqIrq => {
                        let device = HotPlugDeviceInfo {
                            device_type: HotPlugDeviceType::EndPoint,
                            path: self.sysfs_path.clone(),
                            hp_interrupt: false,
                        };

                        let request = VmRequest::HotPlugCommand { device, add: false };
                        if self.vm_socket.send(&request).is_ok() {
                            if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
                            } else {
                                break 'wait;
                            }
                        }
                    }
                    Token::WakeUp => {
                        let _ = wakeup_evt.wait();
                        if let Some(pm_cap) = &self.pm_cap {
                            if pm_cap.lock().should_trigger_pme() {
                                let request = VmRequest::PciPme(self.address.pme_requester_id());
                                if self.vm_socket.send(&request).is_ok() {
                                    if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                        error!("{} failed to send PME: {}", self.name.clone(), e);
                                    }
                                }
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
    }
}

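// A PCIe extended capability header is a single dword: bits 15:0 hold the
// capability ID, bits 19:16 the version, and bits 31:20 the offset of the
// next capability. The offset is dword-aligned, hence the 0xffc mask below.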
fn get_next_from_extcap_header(cap_header: u32) -> u32 {
    (cap_header >> 20) & 0xffc
}

fn is_skipped_ext_cap(cap_id: u16) -> bool {
    matches!(
        cap_id,
        // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
        PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
    )
}

enum DeviceData {
    IntelGfxData { opregion_index: u32 },
}

/// PCI Express Extended Capabilities information
#[derive(Copy, Clone)]
struct ExtCap {
    /// Capability offset in configuration space
    offset: u32,
    /// Capability size in bytes
    size: u32,
    /// Next capability offset; for exposed capabilities this is rewritten to
    /// point at the next non-skipped capability
    next: u16,
    /// Whether this capability is hidden from the guest
    is_skipped: bool,
}

/// Implements a VFIO PCI device, allowing a host PCI device to be passed
/// through to the VM.
pub struct VfioPciDevice {
    device: Arc<VfioDevice>,
    config: VfioPciConfig,
    hotplug: bool,
    hotplug_bus_number: Option<u8>,
    preferred_address: PciAddress,
    pci_address: Option<PciAddress>,
    interrupt_evt: Option<IrqLevelEvent>,
    mmio_regions: Vec<PciBarConfiguration>,
    io_regions: Vec<PciBarConfiguration>,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msi_cap: Option<VfioMsiCap>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
    irq_type: Option<VfioIrqType>,
    vm_socket_mem: Tube,
    device_data: Option<DeviceData>,
    pm_evt: Option<Event>,
    worker_thread: Option<WorkerThread<VfioPciWorker>>,
    vm_socket_vm: Option<Tube>,
    sysfs_path: PathBuf,
    #[cfg(feature = "direct")]
    header_type_reg: Option<u32>,
    // PCI Express Extended Capabilities
    ext_caps: Vec<ExtCap>,
    #[cfg(feature = "direct")]
    is_intel_lpss: bool,
    #[cfg(feature = "direct")]
    supports_coordinated_pm: bool,
    #[cfg(feature = "direct")]
    i2c_devs: HashMap<u16, PathBuf>,
    vcfg_shm_mmap: Option<MemoryMapping>,
    mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<MemSlot>)>,
    activated: bool,
}

#[cfg(feature = "direct")]
fn iter_dir_starts_with(
    path: &Path,
    start: &'static str,
) -> anyhow::Result<impl Iterator<Item = fs::DirEntry>> {
    let dir = fs::read_dir(path)
        .with_context(|| format!("read_dir call on {} failed", path.to_string_lossy()))?;
    Ok(dir
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().map(|f| f.is_dir()).unwrap_or(false))
        .filter(move |e| e.file_name().to_str().unwrap_or("").starts_with(start)))
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device.
    pub fn new(
        sysfs_path: &Path,
        device: VfioDevice,
        hotplug: bool,
        hotplug_bus_number: Option<u8>,
        guest_address: Option<PciAddress>,
        vfio_device_socket_msi: Tube,
        vfio_device_socket_msix: Tube,
        vfio_device_socket_mem: Tube,
        vfio_device_socket_vm: Tube,
        #[cfg(feature = "direct")] is_intel_lpss: bool,
    ) -> Result<Self, PciDeviceError> {
        let preferred_address = if let Some(bus_num) = hotplug_bus_number {
            debug!("hotplug bus {}", bus_num);
            PciAddress {
                // The caller specifies the PCIe bus number for a hotplug device.
                bus: bus_num,
                // devfn must be 0, otherwise the PCIe root port can't detect the device.
                dev: 0,
                func: 0,
            }
        } else if let Some(guest_address) = guest_address {
            debug!("guest PCI address {}", guest_address);
            guest_address
        } else {
            let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
                PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
            })?;
            debug!("parsed device PCI address {}", addr);
            addr
        };

        let dev = Arc::new(device);
        let config = VfioPciConfig::new(Arc::clone(&dev));
        let mut msi_socket = Some(vfio_device_socket_msi);
        let mut msix_socket = Some(vfio_device_socket_msix);
        let mut msi_cap: Option<VfioMsiCap> = None;
        let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
        let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;

        let mut is_pcie = false;
        let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
        let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
        let device_id: u16 = config.read_config(PCI_DEVICE_ID);

        let pci_id = PciId::new(vendor_id, device_id);

        while cap_next != 0 {
            let cap_id: u8 = config.read_config(cap_next);
            if cap_id == PCI_CAP_ID_PM {
                pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
            } else if cap_id == PCI_CAP_ID_MSI {
                if let Some(msi_socket) = msi_socket.take() {
                    msi_cap = Some(VfioMsiCap::new(
                        &config,
                        cap_next,
                        msi_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ));
                }
            } else if cap_id == PCI_CAP_ID_MSIX {
                if let Some(msix_socket) = msix_socket.take() {
                    msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
                        &config,
                        cap_next,
                        msix_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ))));
                }
            } else if cap_id == PciCapabilityID::PciExpress as u8 {
                is_pcie = true;
            }
            let offset = cap_next + PCI_MSI_NEXT_POINTER;
            cap_next = config.read_config::<u8>(offset).into();
        }

        let mut ext_caps: Vec<ExtCap> = Vec::new();
        if is_pcie {
            let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
            while ext_cap_next != 0 {
                let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
                if ext_cap_config == 0 {
                    break;
                }
                ext_caps.push(ExtCap {
                    offset: ext_cap_next,
                    // The size is calculated later, once all offsets are known.
                    size: 0,
                    // Initialized to the device's real next pointer; may be
                    // rewritten below when capabilities are hidden.
                    next: get_next_from_extcap_header(ext_cap_config) as u16,
                    is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
                });
                ext_cap_next = get_next_from_extcap_header(ext_cap_config);
            }

            // Manage extended caps
            //
            // Extended capabilities are chained with each pointing to the next, so
            // we can drop anything other than the head of the chain simply by
            // modifying the previous next pointer. For the head of the chain, we
            // can modify the capability ID to something that cannot match a valid
            // capability. The ID PCI_EXT_CAP_ID_CAC is used for this since it is
            // no longer supported.
            //
            // Walk in reverse order by offset:
            ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
            let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
            let mut non_skipped_next: u16 = 0;
            for ext_cap in ext_caps.iter_mut() {
                if !ext_cap.is_skipped {
                    ext_cap.next = non_skipped_next;
                    non_skipped_next = ext_cap.offset as u16;
                } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
                    ext_cap.next = non_skipped_next;
                }
                ext_cap.size = next_offset - ext_cap.offset;
                next_offset = ext_cap.offset;
            }
            // Restore ascending order by offset.
            ext_caps.reverse();
        }

        let class_code: u8 = config.read_config(PCI_BASE_CLASS_CODE);

        let is_intel_gfx = vendor_id == PCI_VENDOR_ID_INTEL
            && class_code == PciClassCode::DisplayController.get_register_value();
        let device_data = if is_intel_gfx {
            Some(DeviceData::IntelGfxData {
                opregion_index: u32::max_value(),
            })
        } else {
            None
        };

        #[cfg(feature = "direct")]
        let mut i2c_devs: HashMap<u16, PathBuf> = HashMap::new();

        #[cfg(feature = "direct")]
        let (supports_coordinated_pm, header_type_reg) =
            match VfioPciDevice::coordinated_pm(sysfs_path, true) {
                Ok(_) => {
                    if is_intel_lpss {
                        if let Err(e) = VfioPciDevice::coordinated_pm_i2c(sysfs_path, &mut i2c_devs)
                        {
                            warn!("coordinated_pm_i2c not supported: {}", e);
                            for (_, i2c_path) in i2c_devs.iter() {
                                let _ = VfioPciDevice::coordinated_pm(i2c_path, false);
                            }
                            i2c_devs.clear();
                        }
                    }

                    // Cache the dword at offset 0x0c (cacheline size, latency timer,
                    // header type, BIST).
                    // When using the "direct" feature, this dword can be accessed for
                    // device power state. Directly accessing a device's physical PCI
                    // config space in D3cold state causes a hang. We treat the cacheline
                    // size, latency timer and header type fields as immutable in the
                    // guest.
                    let reg: u32 = config.read_config((HEADER_TYPE_REG as u32) * 4);
                    (true, Some(reg))
                }
                Err(e) => {
                    warn!("coordinated_pm not supported: {}", e);
                    (false, None)
                }
            };

        Ok(VfioPciDevice {
            device: dev,
            config,
            hotplug,
            hotplug_bus_number,
            preferred_address,
            pci_address: None,
            interrupt_evt: None,
            mmio_regions: Vec::new(),
            io_regions: Vec::new(),
            pm_cap,
            msi_cap,
            msix_cap,
            irq_type: None,
            vm_socket_mem: vfio_device_socket_mem,
            device_data,
            pm_evt: None,
            worker_thread: None,
            vm_socket_vm: Some(vfio_device_socket_vm),
            sysfs_path: sysfs_path.to_path_buf(),
            #[cfg(feature = "direct")]
            header_type_reg,
            ext_caps,
            #[cfg(feature = "direct")]
            is_intel_lpss,
            #[cfg(feature = "direct")]
            supports_coordinated_pm,
            #[cfg(feature = "direct")]
            i2c_devs,
            vcfg_shm_mmap: None,
            mapped_mmio_bars: BTreeMap::new(),
            activated: false,
        })
    }

    /// Gets the pci address of the device, if one has already been allocated.
    pub fn pci_address(&self) -> Option<PciAddress> {
        self.pci_address
    }

    fn is_intel_gfx(&self) -> bool {
        let mut ret = false;

        if let Some(device_data) = &self.device_data {
            match *device_data {
                DeviceData::IntelGfxData { .. } => ret = true,
            }
        }

        ret
    }

    fn find_region(&self, addr: u64) -> Option<PciBarConfiguration> {
        for mmio_info in self.mmio_regions.iter() {
            if addr >= mmio_info.address() && addr < mmio_info.address() + mmio_info.size() {
                return Some(*mmio_info);
            }
        }

        None
    }

    fn enable_intx(&mut self) {
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            if let Err(e) = self.device.irq_enable(
                &[Some(interrupt_evt.get_trigger())],
                VFIO_PCI_INTX_IRQ_INDEX,
                0,
            ) {
                error!("{} Intx enable failed: {}", self.debug_label(), e);
                return;
            }
            if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx mask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self
                .device
                .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
            {
                error!("{} resample enable failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx unmask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            self.irq_type = Some(VfioIrqType::Intx);
        }
    }

    fn disable_intx(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
            error!("{} Intx disable failed: {}", self.debug_label(), e);
        }
        self.irq_type = None;
    }

    fn disable_irqs(&mut self) {
        match self.irq_type {
            Some(VfioIrqType::Msi) => self.disable_msi(),
            Some(VfioIrqType::Msix) => self.disable_msix(),
            _ => (),
        }

        // disable_msi() or disable_msix() above re-enables INTx, so disable
        // INTx here as well.
        if let Some(VfioIrqType::Intx) = self.irq_type {
            self.disable_intx();
        }
    }

    fn enable_msi(&mut self) {
        self.disable_irqs();

        let irqfd = match &self.msi_cap {
            Some(cap) => {
                if let Some(fd) = cap.get_msi_irqfd() {
                    fd
                } else {
                    self.enable_intx();
                    return;
                }
            }
            None => {
                self.enable_intx();
                return;
            }
        };

        if let Err(e) = self
            .device
            .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
        {
            error!("{} failed to enable msi: {}", self.debug_label(), e);
            self.enable_intx();
            return;
        }

        self.irq_type = Some(VfioIrqType::Msi);
    }

    fn disable_msi(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
            error!("{} failed to disable msi: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;

        self.enable_intx();
    }

    fn enable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }

        self.disable_irqs();
        let cap = self.msix_cap.as_ref().unwrap().lock();
        let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());

        let mut failed = false;
        if !vector_in_use {
            // If there are no msix vectors currently in use, we explicitly assign a new eventfd
            // to vector 0. Then we enable it and immediately disable it, so that VFIO will
            // activate the physical device. If there are available msix vectors, just enable
            // them instead.
            let fd = Event::new().expect("failed to create event");
            let table_size = cap.table_size();
            let mut irqfds = vec![None; table_size];
            irqfds[0] = Some(&fd);
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
            irqfds[0] = None;
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        } else {
            let result = self
                .device
                .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
            if let Err(e) = result {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        }

        std::mem::drop(cap);
        if failed {
            self.enable_intx();
            return;
        }
        self.irq_type = Some(VfioIrqType::Msix);
    }

    fn disable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
            error!("{} failed to disable msix: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;
        self.enable_intx();
    }

    fn msix_vectors_update(&self) -> Result<(), VfioError> {
        if let Some(cap) = &self.msix_cap {
            self.device
                .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
        }
        Ok(())
    }

    fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
        if let Err(e) = self
            .device
            .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
        {
            error!(
                "{} failed to update msix vector {}: {}",
                self.debug_label(),
                index,
                e
            );
        }
    }

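    // Subtract `remove_mmaps` from the sparse mmap areas of a BAR, splitting
    // areas where needed, so the removed ranges remain trapped rather than
    // mapped directly into the guest.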
    fn adjust_bar_mmap(
        &self,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
        remove_mmaps: &[AddressRange],
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
        let pgmask = (pagesize() as u64) - 1;

        for mmap in bar_mmaps.iter() {
            let mmap_range = if let Some(mmap_range) =
                AddressRange::from_start_and_size(mmap.offset as u64, mmap.size as u64)
            {
                mmap_range
            } else {
                continue;
            };
            let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
                Ok(a) => a,
                Err(e) => {
                    error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    mmaps.clear();
                    return mmaps;
                }
            };

            for &(mut remove_range) in remove_mmaps.iter() {
                remove_range = remove_range.intersect(mmap_range);
                if !remove_range.is_empty() {
                    // Align the removed range to page boundaries: round the
                    // start down and the end up.
                    let begin = remove_range.start & !pgmask;
                    let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
                    let remove_range = AddressRange::from_start_and_end(begin, end);
                    if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
                        error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    }
                }
            }

            for mmap in to_mmap.regions {
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: mmap.start,
                    size: mmap.end - mmap.start + 1,
                });
            }
        }

        mmaps
    }

    fn remove_bar_mmap_msix(
        &self,
        bar_index: u32,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
        let mut msix_regions = Vec::new();

        if let Some(t) = msix_cap.get_msix_table(bar_index) {
            msix_regions.push(t);
        }
        if let Some(p) = msix_cap.get_msix_pba(bar_index) {
            msix_regions.push(p);
        }

        if msix_regions.is_empty() {
            return bar_mmaps;
        }

        self.adjust_bar_mmap(bar_mmaps, &msix_regions)
    }

    #[cfg(feature = "direct")]
    fn remove_bar_mmap_lpss(
        &self,
        bar_index: u32,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
    ) -> Vec<vfio_region_sparse_mmap_area> {
        // The LPSS region must be in BAR0.
        if bar_index != 0 {
            return bar_mmaps;
        }

        match AddressRange::from_start_and_size(LPSS_MANATEE_OFFSET, LPSS_MANATEE_SIZE) {
            Some(lpss_range) => self.adjust_bar_mmap(bar_mmaps, &[lpss_range]),
            None => bar_mmaps,
        }
    }

    fn add_bar_mmap(&self, index: u32, bar_addr: u64) -> Vec<MemSlot> {
        let mut mmaps_slots: Vec<MemSlot> = Vec::new();
        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
            // The BAR regions holding the MSI-X table and PBA can't be
            // mmapped; accesses to them must be trapped so that MSI-X can be
            // emulated.
            let mut mmaps = self.device.get_region_mmap(index);

            if self.msix_cap.is_some() {
                mmaps = self.remove_bar_mmap_msix(index, mmaps);
            }
            #[cfg(feature = "direct")]
            if self.is_intel_lpss {
                mmaps = self.remove_bar_mmap_lpss(index, mmaps);
            }
            if mmaps.is_empty() {
                return mmaps_slots;
            }

            for mmap in mmaps.iter() {
                let mmap_offset = mmap.offset;
                let mmap_size = mmap.size;
                let guest_map_start = bar_addr + mmap_offset;
                let region_offset = self.device.get_region_offset(index);
                let offset = region_offset + mmap_offset;
                let descriptor = match self.device.device_file().try_clone() {
                    Ok(device_file) => device_file.into(),
                    Err(_) => break,
                };
                if self
                    .vm_socket_mem
                    .send(&VmMemoryRequest::RegisterMemory {
                        source: VmMemorySource::Descriptor {
                            descriptor,
                            offset,
                            size: mmap_size,
                        },
                        dest: VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
                        prot: Protection::read_write(),
                    })
                    .is_err()
                {
                    break;
                }

                let response: VmMemoryResponse = match self.vm_socket_mem.recv() {
                    Ok(res) => res,
                    Err(_) => break,
                };
                match response {
                    VmMemoryResponse::RegisterMemory { pfn: _, slot } => {
                        mmaps_slots.push(slot);
                    }
                    _ => break,
                }
            }
        }

        mmaps_slots
    }

    fn remove_bar_mmap(&self, mmap_slots: &[MemSlot]) {
        for mmap_slot in mmap_slots {
            if self
                .vm_socket_mem
                .send(&VmMemoryRequest::UnregisterMemory(*mmap_slot))
                .is_err()
            {
                error!("failed to send UnregisterMemory request");
                return;
            }
            if self.vm_socket_mem.recv::<VmMemoryResponse>().is_err() {
                error!("failed to receive UnregisterMemory response");
            }
        }
    }

    fn disable_bars_mmap(&mut self) {
        for (_, (_, mmap_slots)) in self.mapped_mmio_bars.iter() {
            self.remove_bar_mmap(mmap_slots);
        }
        self.mapped_mmio_bars.clear();
    }

    fn commit_bars_mmap(&mut self) {
        // Unmap all bars before remapping bars, to prevent issues with overlap
        let mut needs_map = Vec::new();
        for mmio_info in self.mmio_regions.iter() {
            let bar_idx = mmio_info.bar_index();
            let addr = mmio_info.address();

            if let Some((cur_addr, slots)) = self.mapped_mmio_bars.remove(&bar_idx) {
                if cur_addr == addr {
                    self.mapped_mmio_bars.insert(bar_idx, (cur_addr, slots));
                    continue;
                } else {
                    self.remove_bar_mmap(&slots);
                }
            }

            if addr != 0 {
                needs_map.push((bar_idx, addr));
            }
        }

        for (bar_idx, addr) in needs_map.iter() {
            let slots = self.add_bar_mmap(*bar_idx as u32, *addr);
            self.mapped_mmio_bars.insert(*bar_idx, (*addr, slots));
        }
    }

    fn close(&mut self) {
        if let Some(msi) = self.msi_cap.as_mut() {
            msi.destroy();
        }
        if let Some(msix) = &self.msix_cap {
            msix.lock().destroy();
        }
        self.disable_bars_mmap();
        self.device.close();
    }

    fn start_work_thread(&mut self) {
        let vm_socket = match self.vm_socket_vm.take() {
            Some(socket) => socket,
            None => return,
        };

        let req_evt = match Event::new() {
            Ok(evt) => {
                if let Err(e) = self
                    .device
                    .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
                {
                    error!("{} enable req_irq failed: {}", self.debug_label(), e);
                    return;
                }
                evt
            }
            Err(_) => return,
        };

        let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{} failed creating PM Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };
        self.pm_evt = Some(self_pm_evt);

        let mut msix_evt = Vec::new();
        if let Some(msix_cap) = &self.msix_cap {
            msix_evt = msix_cap.lock().clone_msix_evt();
        }

        let name = self.device.device_name().to_string();
        let address = self.pci_address.expect("Unassigned PCI Address.");
        let sysfs_path = self.sysfs_path.clone();
        let pm_cap = self.pm_cap.clone();
        let msix_cap = self.msix_cap.clone();
        self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
            let mut worker = VfioPciWorker {
                address,
                sysfs_path,
                vm_socket,
                name,
                pm_cap,
                msix_cap,
            };
            worker.run(req_evt, pm_evt, kill_evt, msix_evt);
            worker
        }));
        self.activated = true;
    }

    fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
        let mut i = VFIO_PCI_BAR0_REGION_INDEX;
        let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();

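        // Probe each BAR size the standard PCI way: write all ones to the
        // BAR register, read it back, mask off the flag bits, and take the
        // two's complement of the result to recover the region size.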
1369         while i <= VFIO_PCI_ROM_REGION_INDEX {
1370             let mut low: u32 = 0xffffffff;
1371             let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
1372                 0x30
1373             } else {
1374                 0x10 + i * 4
1375             };
1376             self.config.write_config(low, offset);
1377             low = self.config.read_config(offset);
1378 
1379             let low_flag = low & 0xf;
1380             let is_64bit = low_flag & 0x4 == 0x4;
1381             if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
1382                 let mut upper: u32 = 0xffffffff;
1383                 if is_64bit {
1384                     self.config.write_config(upper, offset + 4);
1385                     upper = self.config.read_config(offset + 4);
1386                 }
1387 
1388                 low &= 0xffff_fff0;
1389                 let mut size: u64 = u64::from(upper);
1390                 size <<= 32;
1391                 size |= u64::from(low);
1392                 size = !size + 1;
1393                 let region_type = if is_64bit {
1394                     PciBarRegionType::Memory64BitRegion
1395                 } else {
1396                     PciBarRegionType::Memory32BitRegion
1397                 };
1398                 let prefetch = if low_flag & 0x8 == 0x8 {
1399                     PciBarPrefetchable::Prefetchable
1400                 } else {
1401                     PciBarPrefetchable::NotPrefetchable
1402                 };
1403                 mem_bars.push(PciBarConfiguration::new(
1404                     i as usize,
1405                     size,
1406                     region_type,
1407                     prefetch,
1408                 ));
1409             } else if low_flag & 0x1 == 0x1 {
1410                 let size = !(low & 0xffff_fffc) + 1;
1411                 self.io_regions.push(PciBarConfiguration::new(
1412                     i as usize,
1413                     size.into(),
1414                     PciBarRegionType::IoRegion,
1415                     PciBarPrefetchable::NotPrefetchable,
1416                 ));
1417             }
1418 
1419             if is_64bit {
1420                 i += 2;
1421             } else {
1422                 i += 1;
1423             }
1424         }
1425         mem_bars
1426     }
1427 
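         // Note on the BAR write in configure_barmem: the low 4 bits of a memory BAR
         // register are read-only flag bits (memory/IO, 32/64-bit, prefetchable), so
         // the new guest address is merged in as (bar_addr & !0xf) | (val & 0xf); for
         // a 64-bit BAR the high 32 bits of the address go into the next register.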
1428     fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
1429         let offset: u32 = bar_info.reg_index() as u32 * 4;
1430         let mmio_region = *bar_info;
1431         self.mmio_regions.push(mmio_region.set_address(bar_addr));
1432 
1433         let val: u32 = self.config.read_config(offset);
1434         let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
1435         self.config.write_config(low, offset);
1436         if bar_info.is_64bit_memory() {
1437             let upper = (bar_addr >> 32) as u32;
1438             self.config.write_config(upper, offset + 4);
1439         }
1440     }
1441 
1442     fn allocate_root_barmem(
1443         &mut self,
1444         mem_bars: &[PciBarConfiguration],
1445         resources: &mut SystemAllocator,
1446     ) -> Result<Vec<BarRange>, PciDeviceError> {
1447         let address = self.pci_address.unwrap();
1448         let mut ranges: Vec<BarRange> = Vec::new();
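             // PCI BARs must be naturally (size-)aligned, hence align(bar_size)
             // below; 32-bit BARs are additionally capped below the 4 GiB boundary.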
1449         for mem_bar in mem_bars {
1450             let bar_size = mem_bar.size();
1451             let mut bar_addr: u64 = 0;
1452             // Don't allocate MMIO for a hotplug device; the OS will allocate it
1453             // from its parent bridge's window.
1454             if !self.hotplug {
1455                 bar_addr = resources
1456                     .allocate_mmio(
1457                         bar_size,
1458                         Alloc::PciBar {
1459                             bus: address.bus,
1460                             dev: address.dev,
1461                             func: address.func,
1462                             bar: mem_bar.bar_index() as u8,
1463                         },
1464                         "vfio_bar".to_string(),
1465                         AllocOptions::new()
1466                             .prefetchable(mem_bar.is_prefetchable())
1467                             .max_address(if mem_bar.is_64bit_memory() {
1468                                 u64::MAX
1469                             } else {
1470                                 u32::MAX.into()
1471                             })
1472                             .align(bar_size),
1473                     )
1474                     .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
1475                 ranges.push(BarRange {
1476                     addr: bar_addr,
1477                     size: bar_size,
1478                     prefetchable: mem_bar.is_prefetchable(),
1479                 });
1480             }
1481             self.configure_barmem(mem_bar, bar_addr);
1482         }
1483         Ok(ranges)
1484     }
1485 
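         // Sketch of the strategy below: behind a bridge, BARs are not placed
         // individually but packed into at most two bridge windows, one
         // non-prefetchable and one prefetchable. Each window is laid out with a
         // private VfioResourceAllocator and then allocated as a single block from
         // the system allocator.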
1486     fn allocate_nonroot_barmem(
1487         &mut self,
1488         mem_bars: &mut [PciBarConfiguration],
1489         resources: &mut SystemAllocator,
1490     ) -> Result<Vec<BarRange>, PciDeviceError> {
1491         const NON_PREFETCHABLE: usize = 0;
1492         const PREFETCHABLE: usize = 1;
1493         const ARRAY_SIZE: usize = 2;
1494         let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1495         let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1496             match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1497                 Ok(a) => a,
1498                 Err(e) => {
1499                     error!(
1500                         "{} init nonroot VfioResourceAllocator failed: {}",
1501                         self.debug_label(),
1502                         e
1503                     );
1504                     return Err(e);
1505                 }
1506             },
1507             match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1508                 Ok(a) => a,
1509                 Err(e) => {
1510                     error!(
1511                         "{} init nonroot VfioResourceAllocator failed: {}",
1512                         self.debug_label(),
1513                         e
1514                     );
1515                     return Err(e);
1516                 }
1517             },
1518         ];
1519         let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1520         // the window must be 1M-aligned as per the PCI spec
1521         let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
1522         let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];
1523 
1524         // Sort by descending BAR size; this can reduce the total size allocated for all the BARs.
1525         mem_bars.sort_by_key(|a| Reverse(a.size()));
1526         for mem_bar in mem_bars {
1527             let prefetchable = mem_bar.is_prefetchable();
1528             let is_64bit = mem_bar.is_64bit_memory();
1529 
1530             // If any prefetchable BAR is 32-bit, all prefetchable BARs must be placed
1531             // in low MMIO, since all prefetchable BARs share a single bridge window.
1532             if prefetchable && !is_64bit {
1533                 memtype[PREFETCHABLE] = MmioType::Low;
1534             }
1535             let i = if prefetchable {
1536                 PREFETCHABLE
1537             } else {
1538                 NON_PREFETCHABLE
1539             };
1540             let bar_size = mem_bar.size();
1541             let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1542                 Ok(s) => s,
1543                 Err(e) => {
1544                     error!(
1545                         "{} nonroot allocate_with_align failed: {}",
1546                         self.debug_label(),
1547                         e
1548                     );
1549                     return Err(e);
1550                 }
1551             };
1552             window_sz[i] = max(window_sz[i], start + bar_size);
1553             alignment[i] = max(alignment[i], bar_size);
1554             let mem_info = (*mem_bar).set_address(start);
1555             membars[i].push(mem_info);
1556         }
1557 
1558         let address = self.pci_address.unwrap();
1559         let mut ranges: Vec<BarRange> = Vec::new();
1560         for (index, bars) in membars.iter().enumerate() {
1561             if bars.is_empty() {
1562                 continue;
1563             }
1564 
1565             let i = if index == 1 {
1566                 PREFETCHABLE
1567             } else {
1568                 NON_PREFETCHABLE
1569             };
1570             let mut window_addr: u64 = 0;
1571             // Don't allocate MMIO for a hotplug device; the OS will allocate it
1572             // from its parent bridge's window.
1573             if !self.hotplug {
1574                 window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
1575                 let alloc = if i == NON_PREFETCHABLE {
1576                     Alloc::PciBridgeWindow {
1577                         bus: address.bus,
1578                         dev: address.dev,
1579                         func: address.func,
1580                     }
1581                 } else {
1582                     Alloc::PciBridgePrefetchWindow {
1583                         bus: address.bus,
1584                         dev: address.dev,
1585                         func: address.func,
1586                     }
1587                 };
1588                 window_addr = resources
1589                     .mmio_allocator(memtype[i])
1590                     .allocate_with_align(
1591                         window_sz[i],
1592                         alloc,
1593                         "vfio_bar_window".to_string(),
1594                         alignment[i],
1595                     )
1596                     .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
1597                 for mem_info in bars {
1598                     let bar_addr = window_addr + mem_info.address();
1599                     ranges.push(BarRange {
1600                         addr: bar_addr,
1601                         size: mem_info.size(),
1602                         prefetchable: mem_info.is_prefetchable(),
1603                     });
1604                 }
1605             }
1606 
1607             for mem_info in bars {
1608                 let bar_addr = window_addr + mem_info.address();
1609                 self.configure_barmem(mem_info, bar_addr);
1610             }
1611         }
1612         Ok(ranges)
1613     }
1614 
1615     /// Returns the maximum IOVA address supported by the VFIO PCI device.
1616     pub fn get_max_iova(&self) -> u64 {
1617         self.device.get_max_addr()
1618     }
1619 
1620     #[cfg(feature = "direct")]
1621     fn coordinated_pm(sysfs_path: &Path, enter: bool) -> anyhow::Result<()> {
1622         let path = sysfs_path.join("power/coordinated");
1623         fs::write(&path, if enter { "enter\n" } else { "exit\n" })
1624             .with_context(|| format!("Failed to write to {}", path.to_string_lossy()))
1625     }
1626 
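         // The three helpers below walk the sysfs hierarchy for coordinated PM:
         // coordinated_pm_i2c scans sysfs_path for "i2c_designware*" platform
         // devices, coordinated_pm_i2c_platdev descends into their "i2c-*" adapters,
         // and coordinated_pm_i2c_adap enables coordinated PM on each "i2c-*" client
         // device, recording its bus address for later op calls.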
1627     #[cfg(feature = "direct")]
1628     fn coordinated_pm_i2c_adap(
1629         adap_path: &Path,
1630         i2c_devs: &mut HashMap<u16, PathBuf>,
1631     ) -> anyhow::Result<()> {
1632         for entry in iter_dir_starts_with(adap_path, "i2c-")? {
1633             let path = adap_path.join(entry.file_name());
1634 
1635             VfioPciDevice::coordinated_pm(&path, true)?;
1636 
1637             let addr_path = path.join("address");
1638             let addr = fs::read_to_string(&addr_path).with_context(|| {
1639                 format!(
1640                     "Failed to read from {}",
1641                     addr_path.to_string_lossy()
1642                 )
1643             })?;
1644             let addr = addr.trim_end().parse::<u16>().with_context(|| {
1645                 format!(
1646                     "Failed to parse {} from {}",
1647                     addr,
1648                     addr_path.to_string_lossy()
1649                 )
1650             })?;
1651 
1652             if let Some(c) = i2c_devs.insert(addr, path.to_path_buf()) {
1653                 anyhow::bail!(
1654                     "Collision encountered: {}, {}",
1655                     path.to_string_lossy(),
1656                     c.to_string_lossy()
1657                 );
1658             }
1659         }
1660         Ok(())
1661     }
1662 
1663     #[cfg(feature = "direct")]
1664     fn coordinated_pm_i2c_platdev(
1665         plat_path: &Path,
1666         i2c_devs: &mut HashMap<u16, PathBuf>,
1667     ) -> anyhow::Result<()> {
1668         for entry in iter_dir_starts_with(plat_path, "i2c-")? {
1669             let path = plat_path.join(entry.file_name());
1670             VfioPciDevice::coordinated_pm_i2c_adap(&path, i2c_devs)?;
1671         }
1672         Ok(())
1673     }
1674 
1675     #[cfg(feature = "direct")]
1676     fn coordinated_pm_i2c(
1677         sysfs_path: &Path,
1678         i2c_devs: &mut HashMap<u16, PathBuf>,
1679     ) -> anyhow::Result<()> {
1680         for entry in iter_dir_starts_with(sysfs_path, "i2c_designware")? {
1681             let path = sysfs_path.join(entry.file_name());
1682             VfioPciDevice::coordinated_pm_i2c_platdev(&path, i2c_devs)?;
1683         }
1684         Ok(())
1685     }
1686 
1687     #[cfg(feature = "direct")]
1688     fn power_state(&self) -> anyhow::Result<u8> {
1689         let path = self.sysfs_path.join("power_state");
1690         let state = fs::read_to_string(&path)
1691             .with_context(|| format!("Failed to read from {}", path.to_string_lossy()))?;
1692         match state.as_str() {
1693             "D0\n" => Ok(0),
1694             "D1\n" => Ok(1),
1695             "D2\n" => Ok(2),
1696             "D3hot\n" => Ok(3),
1697             "D3cold\n" => Ok(4),
1698             "unknown\n" => Ok(5),
1699             _ => Err(std::io::Error::new(
1700                 std::io::ErrorKind::InvalidData,
1701                 "invalid state",
1702             ))?,
1703         }
1704     }
1705 
1706     #[cfg(feature = "direct")]
1707     fn op_call(path: &Path, id: u8) -> anyhow::Result<()> {
1708         let path = path.join("power/op_call");
1709         fs::write(&path, &[id])
1710             .with_context(|| format!("Failed to write to {}", path.to_string_lossy()))
1711     }
1712 
1713     fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
1714         self.ext_caps
1715             .iter()
1716             .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
1717             .cloned()
1718     }
1719 
1720     fn is_skipped_reg(&self, reg: u32) -> bool {
1721         // Fast path: registers in standard PCI config space are never skipped.
1722         if reg < PCI_CONFIG_SPACE_SIZE {
1723             return false;
1724         }
1725 
1726         self.get_ext_cap_by_reg(reg)
1727             .map_or(false, |cap| cap.is_skipped)
1728     }
1729 }
1730 
1731 impl PciDevice for VfioPciDevice {
1732     fn debug_label(&self) -> String {
1733         format!("vfio {} device", self.device.device_name())
1734     }
1735 
1736     fn preferred_address(&self) -> Option<PciAddress> {
1737         Some(self.preferred_address)
1738     }
1739 
1740     fn allocate_address(
1741         &mut self,
1742         resources: &mut SystemAllocator,
1743     ) -> Result<PciAddress, PciDeviceError> {
1744         if self.pci_address.is_none() {
1745             let mut address = self.preferred_address;
1746             while address.func < 8 {
1747                 if resources.reserve_pci(
1748                     Alloc::PciBar {
1749                         bus: address.bus,
1750                         dev: address.dev,
1751                         func: address.func,
1752                         bar: 0,
1753                     },
1754                     self.debug_label(),
1755                 ) {
1756                     self.pci_address = Some(address);
1757                     break;
1758                 } else if self.hotplug_bus_number.is_none() {
1759                     break;
1760                 } else {
1761                     address.func += 1;
1762                 }
1763             }
1764         }
1765         self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1766     }
1767 
1768     fn keep_rds(&self) -> Vec<RawDescriptor> {
1769         let mut rds = self.device.keep_rds();
1770         if let Some(ref interrupt_evt) = self.interrupt_evt {
1771             rds.extend(interrupt_evt.as_raw_descriptors());
1772         }
1773         rds.push(self.vm_socket_mem.as_raw_descriptor());
1774         if let Some(vm_socket_vm) = &self.vm_socket_vm {
1775             rds.push(vm_socket_vm.as_raw_descriptor());
1776         }
1777         if let Some(msi_cap) = &self.msi_cap {
1778             rds.push(msi_cap.config.get_msi_socket());
1779         }
1780         if let Some(msix_cap) = &self.msix_cap {
1781             rds.push(msix_cap.lock().config.as_raw_descriptor());
1782         }
1783         rds
1784     }
1785 
1786     fn preferred_irq(&self) -> PreferredIrq {
1787         // Is INTx configured?
1788         let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
1789             1 => PciInterruptPin::IntA,
1790             2 => PciInterruptPin::IntB,
1791             3 => PciInterruptPin::IntC,
1792             4 => PciInterruptPin::IntD,
1793             _ => return PreferredIrq::None,
1794         };
1795 
1796         // TODO: replace sysfs/irq value parsing with vfio interface
1797         //       reporting host allocated interrupt number and type.
1798         let path = self.sysfs_path.join("irq");
1799         let gsi = fs::read_to_string(path)
1800             .map(|v| v.trim().parse::<u32>().unwrap_or(0))
1801             .unwrap_or(0);
1802 
1803         PreferredIrq::Fixed { pin, gsi }
1804     }
1805 
1806     fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
1807         // Keep event/resample event references.
1808         self.interrupt_evt = Some(irq_evt);
1809 
1810         // enable INTX
1811         self.enable_intx();
1812 
1813         self.config
1814             .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
1815         self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
1816     }
1817 
1818     fn allocate_io_bars(
1819         &mut self,
1820         resources: &mut SystemAllocator,
1821     ) -> Result<Vec<BarRange>, PciDeviceError> {
1822         let address = self
1823             .pci_address
1824             .expect("allocate_address must be called prior to allocate_io_bars");
1825 
1826         let mut mem_bars = self.collect_bars();
1827 
1828         let ranges = if address.bus == 0 {
1829             self.allocate_root_barmem(&mem_bars, resources)?
1830         } else {
1831             self.allocate_nonroot_barmem(&mut mem_bars, resources)?
1832         };
1833 
1834         // Quirk: enable IGD memory for guest VGA arbitration; otherwise the kernel's
1835         // VGA arbiter driver won't claim this VGA device, and Xorg can't start.
1836         if self.is_intel_gfx() {
1837             let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
1838             cmd |= PCI_COMMAND_MEMORY;
1839             self.config.write_config(cmd, PCI_COMMAND);
1840         }
1841         Ok(ranges)
1842     }
1843 
1844     fn allocate_device_bars(
1845         &mut self,
1846         resources: &mut SystemAllocator,
1847     ) -> Result<Vec<BarRange>, PciDeviceError> {
1848         let mut ranges: Vec<BarRange> = Vec::new();
1849 
1850         if !self.is_intel_gfx() {
1851             return Ok(ranges);
1852         }
1853 
1854         // Expose the Intel graphics OpRegion as an MMIO BAR: allocate a GPA for it,
1855         // then write that GPA into the PCI config register.
1856         if let Some((index, size)) = self.device.get_cap_type_info(
1857             VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
1858             VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
1859         ) {
1860             let address = self
1861                 .pci_address
1862                 .expect("allocate_address must be called prior to allocate_device_bars");
1863             let bar_addr = resources
1864                 .allocate_mmio(
1865                     size,
1866                     Alloc::PciBar {
1867                         bus: address.bus,
1868                         dev: address.dev,
1869                         func: address.func,
1870                         bar: (index * 4) as u8,
1871                     },
1872                     "vfio_bar".to_string(),
1873                     AllocOptions::new().max_address(u32::MAX.into()),
1874                 )
1875                 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1876             ranges.push(BarRange {
1877                 addr: bar_addr,
1878                 size,
1879                 prefetchable: false,
1880             });
1881             self.device_data = Some(DeviceData::IntelGfxData {
1882                 opregion_index: index,
1883             });
1884 
1885             self.mmio_regions.push(
1886                 PciBarConfiguration::new(
1887                     index as usize,
1888                     size,
1889                     PciBarRegionType::Memory32BitRegion,
1890                     PciBarPrefetchable::NotPrefetchable,
1891                 )
1892                 .set_address(bar_addr),
1893             );
1894             self.config.write_config(bar_addr as u32, 0xFC);
1895         }
1896 
1897         Ok(ranges)
1898     }
1899 
1900     fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1901         for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
1902             if region.bar_index() == bar_num {
1903                 let command: u8 = self.config.read_config(PCI_COMMAND);
1904                 if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
1905                     return None;
1906                 } else {
1907                     return Some(*region);
1908                 }
1909             }
1910         }
1911 
1912         None
1913     }
1914 
1915     fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
1916         Ok(())
1917     }
1918 
1919     fn read_config_register(&self, reg_idx: usize) -> u32 {
1920         #[cfg(feature = "direct")]
1921         if reg_idx == HEADER_TYPE_REG {
1922             if let Some(header_type_reg) = self.header_type_reg {
1923                 let mut v = header_type_reg.to_le_bytes();
1924                 // HACK
1925                 // Reads from the "BIST" register are interpreted as device
1926                 // PCI power state
1927                 v[3] = self.power_state().unwrap_or_else(|e| {
1928                     error!("Failed to get device power state: {}", e);
1929                     5 // unknown state
1930                 });
1931                 return u32::from_le_bytes(v);
1932             }
1933         }
1934 
1935         let reg: u32 = (reg_idx * 4) as u32;
1936         let mut config: u32 = self.config.read_config(reg);
1937 
1938         // See VfioPciDevice::new for details on how extended caps are managed
1939         if reg >= PCI_CONFIG_SPACE_SIZE {
1940             let ext_cap = self.get_ext_cap_by_reg(reg);
1941             if let Some(ext_cap) = ext_cap {
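                     // A PCIe extended capability header keeps the "next capability"
                     // offset in bits 31:20, so patching the chain below rewrites only
                     // those bits and leaves the capability ID and version untouched.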
1942                 if ext_cap.offset == reg {
1943                     config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
1944                 }
1945 
1946                 if ext_cap.is_skipped {
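                         // A skipped capability reads as zeros, except at the first
                         // extended-capability slot (offset 0x100), which must stay a
                         // valid header for the chain to remain walkable: its ID is
                         // replaced with a dummy one while the next pointer is kept.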
1947                     if reg == PCI_CONFIG_SPACE_SIZE {
1948                         config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
1949                     } else {
1950                         config = 0;
1951                     }
1952                 }
1953             }
1954         }
1955 
1956         // Hide I/O BARs: report 0 for their BAR registers
1957         if (0x10..=0x24).contains(&reg) {
1958             let bar_idx = (reg as usize - 0x10) / 4;
1959             if let Some(bar) = self.get_bar_configuration(bar_idx) {
1960                 if bar.is_io() {
1961                     config = 0;
1962                 }
1963             }
1964         } else if let Some(msix_cap) = &self.msix_cap {
1965             let msix_cap = msix_cap.lock();
1966             if msix_cap.is_msix_control_reg(reg, 4) {
1967                 msix_cap.read_msix_control(&mut config);
1968             }
1969         } else if let Some(pm_cap) = &self.pm_cap {
1970             let pm_cap = pm_cap.lock();
1971             if pm_cap.is_pm_reg(reg) {
1972                 config = pm_cap.read(reg);
1973             }
1974         }
1975 
1976         // Quirk for Intel graphics: report the stolen memory size as 0 in pci_cfg[0x51]
1977         if self.is_intel_gfx() && reg == 0x50 {
1978             config &= 0xffff00ff;
1979         }
1980 
1981         config
1982     }
1983 
1984     fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
1985         // Start the worker thread the first time the guest writes a config register
1986         if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
1987             self.start_work_thread();
1988         };
1989 
1990         #[cfg(feature = "direct")]
1991         if self.supports_coordinated_pm
1992             && reg_idx == CLASS_REG
1993             && offset == CLASS_REG_REVISION_ID_OFFSET as u64
1994             && data.len() == 1
1995         {
1996             // HACK
1997             // Byte writes to the "Revision ID" register are interpreted as PM
1998             // op calls
1999             if let Err(e) = VfioPciDevice::op_call(&self.sysfs_path, data[0]) {
2000                 error!("Failed to perform op call: {}", e);
2001             }
2002             return;
2003         }
2004 
2005         let start = (reg_idx * 4) as u64 + offset;
2006 
2007         if let Some(pm_cap) = self.pm_cap.as_mut() {
2008             let mut pm_cap = pm_cap.lock();
2009             if pm_cap.is_pm_reg(start as u32) {
2010                 pm_cap.write(start, data);
2011             }
2012         }
2013 
2014         let mut msi_change: Option<VfioMsiChange> = None;
2015         if let Some(msi_cap) = self.msi_cap.as_mut() {
2016             if msi_cap.is_msi_reg(start, data.len()) {
2017                 msi_change = msi_cap.write_msi_reg(start, data);
2018             }
2019         }
2020 
2021         match msi_change {
2022             Some(VfioMsiChange::Enable) => self.enable_msi(),
2023             Some(VfioMsiChange::Disable) => self.disable_msi(),
2024             _ => (),
2025         }
2026 
2027         msi_change = None;
2028         if let Some(msix_cap) = &self.msix_cap {
2029             let mut msix_cap = msix_cap.lock();
2030             if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
2031                 msi_change = msix_cap.write_msix_control(data);
2032             }
2033         }
2034 
2035         match msi_change {
2036             Some(VfioMsiChange::Enable) => self.enable_msix(),
2037             Some(VfioMsiChange::Disable) => self.disable_msix(),
2038             Some(VfioMsiChange::FunctionChanged) => {
2039                 if let Err(e) = self.msix_vectors_update() {
2040                     error!("update msix vectors failed: {}", e);
2041                 }
2042             }
2043             _ => (),
2044         }
2045 
2046         if !self.is_skipped_reg(start as u32) {
2047             self.device
2048                 .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, start);
2049         }
2050 
2051         // If the guest enables memory access, commit the BAR mmaps once
2052         if start == PCI_COMMAND as u64
2053             && data.len() == 2
2054             && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
2055         {
2056             self.commit_bars_mmap();
2057         } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
2058             let bar_idx = (start as u32 - 0x10) / 4;
2059             let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
2060             let val = u32::from_le_bytes(value);
2061             let mut modify = false;
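                 // A 64-bit BAR spans two consecutive config registers: the even
                 // index holds the low 32 bits (flag bits masked off) and the next
                 // index holds the high 32 bits, so both halves are tracked below to
                 // reconstruct the full guest address.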
2062             for region in self.mmio_regions.iter_mut() {
2063                 if region.bar_index() == bar_idx as usize {
2064                     let old_addr = region.address();
2065                     let new_addr = val & 0xFFFFFFF0;
2066                     if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
2067                         // Change 32bit bar address
2068                         *region = region.set_address(u64::from(new_addr));
2069                         modify = true;
2070                     } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
2071                         // Change 64bit bar low address
2072                         *region =
2073                             region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
2074                         modify = true;
2075                     }
2076                     break;
2077                 } else if region.is_64bit_memory()
2078                     && ((bar_idx % 2) == 1)
2079                     && (region.bar_index() + 1 == bar_idx as usize)
2080                 {
2081                     // Change 64bit bar high address
2082                     let old_addr = region.address();
2083                     if val != (old_addr >> 32) as u32 {
2084                         let mut new_addr = (u64::from(val)) << 32;
2085                         new_addr |= old_addr & 0xFFFFFFFF;
2086                         *region = region.set_address(new_addr);
2087                         modify = true;
2088                     }
2089                     break;
2090                 }
2091             }
2092             if modify {
2093                 // If a BAR changed while memory access is enabled, mmap the
2094                 // new BAR immediately.
2095                 let cmd = self.config.read_config::<u8>(PCI_COMMAND);
2096                 if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
2097                     self.commit_bars_mmap();
2098                 }
2099             }
2100         }
2101     }
2102 
2103     fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
2104         warn!(
2105             "{} read unsupported register {}",
2106             self.debug_label(),
2107             reg_idx
2108         );
2109         0
2110     }
2111 
2112     fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
2113         match reg_idx {
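                 // Virtual config register 0 controls device power: writing 0
                 // requests low-power entry (with wakeup notification if a PM event
                 // is available), and any other value requests low-power exit.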
2114             0 => {
2115                 match value {
2116                     0 => {
2117                         if let Some(pm_evt) =
2118                             self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
2119                         {
2120                             let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
2121                         } else {
2122                             let _ = self.device.pm_low_power_enter();
2123                         }
2124                     }
2125                     _ => {
2126                         let _ = self.device.pm_low_power_exit();
2127                     }
2128                 };
2129             }
2130             _ => warn!(
2131                 "{} write unsupported register {}",
2132                 self.debug_label(),
2133                 reg_idx
2134             ),
2135         };
2136     }
2137 
2138     fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
2139         if let Some(mmio_info) = self.find_region(addr) {
2140             let offset = addr - mmio_info.address();
2141             let bar_index = mmio_info.bar_index() as u32;
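                 // MSI-X table and PBA accesses are emulated rather than passed
                 // through, keeping the guest-visible vector state under the VMM's
                 // control.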
2142             if let Some(msix_cap) = &self.msix_cap {
2143                 let msix_cap = msix_cap.lock();
2144                 if msix_cap.is_msix_table(bar_index, offset) {
2145                     msix_cap.read_table(offset, data);
2146                     return;
2147                 } else if msix_cap.is_msix_pba(bar_index, offset) {
2148                     msix_cap.read_pba(offset, data);
2149                     return;
2150                 }
2151             }
2152             self.device.region_read(bar_index, data, offset);
2153         }
2154     }
2155 
2156     fn write_bar(&mut self, addr: u64, data: &[u8]) {
2157         if let Some(mmio_info) = self.find_region(addr) {
2158             // Ignore writes to the IGD OpRegion
2159             if let Some(device_data) = &self.device_data {
2160                 match *device_data {
2161                     DeviceData::IntelGfxData { opregion_index } => {
2162                         if opregion_index == mmio_info.bar_index() as u32 {
2163                             return;
2164                         }
2165                     }
2166                 }
2167             }
2168 
2169             let offset = addr - mmio_info.address();
2170             let bar_index = mmio_info.bar_index() as u32;
2171 
2172             if let Some(msix_cap) = &self.msix_cap {
2173                 let mut msix_cap = msix_cap.lock();
2174                 if msix_cap.is_msix_table(bar_index, offset) {
2175                     let behavior = msix_cap.write_table(offset, data);
2176                     if let MsixStatus::EntryChanged(index) = behavior {
2177                         let irqfd = msix_cap.get_msix_irqfd(index);
2178                         self.msix_vector_update(index, irqfd);
2179                     }
2180                     return;
2181                 } else if msix_cap.is_msix_pba(bar_index, offset) {
2182                     msix_cap.write_pba(offset, data);
2183                     return;
2184                 }
2185             }
2186 
2187             #[cfg(feature = "direct")]
2188             if self.is_intel_lpss
2189                 && bar_index == 0
2190                 && offset >= LPSS_MANATEE_OFFSET
2191                 && offset < LPSS_MANATEE_OFFSET + LPSS_MANATEE_SIZE
2192             {
2193                 if offset != LPSS_MANATEE_OFFSET {
2194                     warn!(
2195                         "{} write_bar invalid offset 0x{:x}",
2196                         self.debug_label(),
2197                         offset,
2198                     );
2199                     return;
2200                 }
2201 
2202                 let val = if let Ok(bytes) = data.try_into() {
2203                     u64::from_le_bytes(bytes)
2204                 } else {
2205                     warn!(
2206                         "{} write_bar invalid len 0x{:x}",
2207                         self.debug_label(),
2208                         data.len()
2209                     );
2210                     return;
2211                 };
2212                 let addr = val as u16;
2213                 let id = (val >> 32) as u8;
2214 
2215                 match self.i2c_devs.get(&addr) {
2216                     Some(path) => {
2217                         if let Err(e) = VfioPciDevice::op_call(path, id) {
2218                             error!("{} Failed to perform op call: {}", self.debug_label(), e);
2219                         }
2220                     }
2221                     None => {
2222                         warn!(
2223                             "{} write_bar addr 0x{:x} id 0x{:x} not found",
2224                             self.debug_label(),
2225                             addr,
2226                             id
2227                         );
2228                     }
2229                 }
2230                 return;
2231             }
2232             self.device.region_write(bar_index, data, offset);
2233         }
2234     }
2235 
2236     fn destroy_device(&mut self) {
2237         self.close();
2238     }
2239 
2240     fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
2241         let mut amls = Vec::new();
2242         let mut shm = None;
2243         if let Some(pci_address) = self.pci_address {
2244             let vcfg_offset = pci_address.to_config_address(0, 13);
2245             if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
2246                 vcfg_register.to_aml_bytes(&mut amls);
2247                 shm = vcfg_register
2248                     .create_shm_mmap()
2249                     .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
2250                 self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
2251                 // All vfio-pci devices should have a virtual _PRx method; otherwise
2252                 // the host couldn't tell whether the device has entered a suspend
2253                 // state and would always consider it active, so its parent PCIe
2254                 // switch could never enter a suspend state either.
2255                 PowerResourceMethod {}.to_aml_bytes(&mut amls);
2256             }
2257         }
2258 
2259         (amls, shm)
2260     }
2261 }
2262 
2263 impl Suspendable for VfioPciDevice {
2264     fn sleep(&mut self) -> anyhow::Result<()> {
2265         #[cfg(feature = "direct")]
2266         if self.supports_coordinated_pm {
2267             for (_, i2c_path) in self.i2c_devs.iter() {
2268                 let _ = VfioPciDevice::coordinated_pm(i2c_path, false);
2269             }
2270             let _ = VfioPciDevice::coordinated_pm(&self.sysfs_path, false);
2271         }
2272 
2273         if let Some(worker_thread) = self.worker_thread.take() {
2274             let res = worker_thread.stop();
2275             self.pci_address = Some(res.address);
2276             self.sysfs_path = res.sysfs_path;
2277             self.pm_cap = res.pm_cap;
2278             self.msix_cap = res.msix_cap;
2279             self.vm_socket_vm = Some(res.vm_socket);
2280         }
2281         Ok(())
2282     }
2283 
2284     fn wake(&mut self) -> anyhow::Result<()> {
2285         if self.activated {
2286             self.start_work_thread();
2287         }
2288         Ok(())
2289     }
2290 }
2291 
2292 #[cfg(test)]
2293 mod tests {
2294     use resources::AddressRange;
2295 
2296     use super::VfioResourceAllocator;
2297 
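         // The tests below exercise VfioResourceAllocator::allocate_at_can_overlap,
         // which carves the requested range out of the free-region list; ranges that
         // fall partly or wholly outside the current free regions are clipped rather
         // than treated as errors (see no_overlap).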
2298     #[test]
2299     fn no_overlap() {
2300         // regions [32, 95]
2301         let mut memory =
2302             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2303         memory
2304             .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
2305             .unwrap();
2306         memory
2307             .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
2308             .unwrap();
2309 
2310         let mut iter = memory.regions.iter();
2311         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
2312     }
2313 
2314     #[test]
2315     fn complete_overlap() {
2316         // regions [32, 95]
2317         let mut memory =
2318             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2319         // regions [32, 47], [64, 95]
2320         memory
2321             .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2322             .unwrap();
2323         // regions [64, 95]
2324         memory
2325             .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
2326             .unwrap();
2327 
2328         let mut iter = memory.regions.iter();
2329         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
2330     }
2331 
2332     #[test]
2333     fn partial_overlap_one() {
2334         // regions [32, 95]
2335         let mut memory =
2336             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2337         // regions [32, 47], [64, 95]
2338         memory
2339             .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2340             .unwrap();
2341         // regions [32, 39], [64, 95]
2342         memory
2343             .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
2344             .unwrap();
2345 
2346         let mut iter = memory.regions.iter();
2347         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
2348         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
2349     }
2350 
2351     #[test]
2352     fn partial_overlap_two() {
2353         // regions [32, 95]
2354         let mut memory =
2355             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2356         // regions [32, 47], [64, 95]
2357         memory
2358             .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2359             .unwrap();
2360         // regions [32, 39], [72, 95]
2361         memory
2362             .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
2363             .unwrap();
2364 
2365         let mut iter = memory.regions.iter();
2366         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
2367         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
2368     }
2369 
2370     #[test]
2371     fn partial_overlap_three() {
2372         // regions [32, 95]
2373         let mut memory =
2374             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2375         // regions [32, 39], [48, 95]
2376         memory
2377             .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
2378             .unwrap();
2379         // regions [32, 39], [48, 63], [72, 95]
2380         memory
2381             .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
2382             .unwrap();
2383         // regions [32, 35], [76, 95]
2384         memory
2385             .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
2386             .unwrap();
2387 
2388         let mut iter = memory.regions.iter();
2389         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
2390         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
2391     }
2392 }
2393