1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::cmp::max;
6 use std::cmp::Reverse;
7 use std::collections::BTreeMap;
8 use std::collections::BTreeSet;
9 #[cfg(feature = "direct")]
10 use std::collections::HashMap;
11 use std::fs;
12 use std::path::Path;
13 use std::path::PathBuf;
14 use std::str::FromStr;
15 use std::sync::Arc;
16 use std::u32;
17
18 use acpi_tables::aml::Aml;
19 #[cfg(feature = "direct")]
20 use anyhow::Context;
21 use base::debug;
22 use base::error;
23 use base::pagesize;
24 use base::warn;
25 use base::AsRawDescriptor;
26 use base::AsRawDescriptors;
27 use base::Event;
28 use base::EventToken;
29 use base::MemoryMapping;
30 use base::Protection;
31 use base::RawDescriptor;
32 use base::Tube;
33 use base::WaitContext;
34 use base::WorkerThread;
35 use hypervisor::MemSlot;
36 use resources::AddressRange;
37 use resources::Alloc;
38 use resources::AllocOptions;
39 use resources::MmioType;
40 use resources::SystemAllocator;
41 use sync::Mutex;
42 use vfio_sys::*;
43 use vm_control::HotPlugDeviceInfo;
44 use vm_control::HotPlugDeviceType;
45 use vm_control::VmMemoryDestination;
46 use vm_control::VmMemoryRequest;
47 use vm_control::VmMemoryResponse;
48 use vm_control::VmMemorySource;
49 use vm_control::VmRequest;
50 use vm_control::VmResponse;
51
52 use crate::pci::acpi::DeviceVcfgRegister;
53 use crate::pci::acpi::PowerResourceMethod;
54 use crate::pci::acpi::SHM_OFFSET;
55 use crate::pci::msi::MsiConfig;
56 use crate::pci::msi::MsiStatus;
57 use crate::pci::msi::PCI_MSI_FLAGS;
58 use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
59 use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
60 use crate::pci::msi::PCI_MSI_NEXT_POINTER;
61 use crate::pci::msix::MsixConfig;
62 use crate::pci::msix::MsixStatus;
63 use crate::pci::msix::BITS_PER_PBA_ENTRY;
64 use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
65 use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
66 #[cfg(feature = "direct")]
67 use crate::pci::pci_configuration::CLASS_REG;
68 #[cfg(feature = "direct")]
69 use crate::pci::pci_configuration::CLASS_REG_REVISION_ID_OFFSET;
70 #[cfg(feature = "direct")]
71 use crate::pci::pci_configuration::HEADER_TYPE_REG;
72 use crate::pci::pci_device::BarRange;
73 use crate::pci::pci_device::Error as PciDeviceError;
74 use crate::pci::pci_device::PciDevice;
75 use crate::pci::pci_device::PreferredIrq;
76 use crate::pci::pm::PciPmCap;
77 use crate::pci::pm::PmConfig;
78 use crate::pci::pm::PM_CAP_LENGTH;
79 use crate::pci::PciAddress;
80 use crate::pci::PciBarConfiguration;
81 use crate::pci::PciBarIndex;
82 use crate::pci::PciBarPrefetchable;
83 use crate::pci::PciBarRegionType;
84 use crate::pci::PciCapabilityID;
85 use crate::pci::PciClassCode;
86 use crate::pci::PciId;
87 use crate::pci::PciInterruptPin;
88 use crate::pci::PCI_VENDOR_ID_INTEL;
89 use crate::vfio::VfioDevice;
90 use crate::vfio::VfioError;
91 use crate::vfio::VfioIrqType;
92 use crate::vfio::VfioPciConfig;
93 use crate::IrqLevelEvent;
94 use crate::Suspendable;
95
96 const PCI_VENDOR_ID: u32 = 0x0;
97 const PCI_DEVICE_ID: u32 = 0x2;
98 const PCI_COMMAND: u32 = 0x4;
99 const PCI_COMMAND_MEMORY: u8 = 0x2;
100 const PCI_BASE_CLASS_CODE: u32 = 0x0B;
101 const PCI_INTERRUPT_NUM: u32 = 0x3C;
102 const PCI_INTERRUPT_PIN: u32 = 0x3D;
103
104 const PCI_CAPABILITY_LIST: u32 = 0x34;
105 const PCI_CAP_ID_MSI: u8 = 0x05;
106 const PCI_CAP_ID_MSIX: u8 = 0x11;
107 const PCI_CAP_ID_PM: u8 = 0x01;
108
109 // Size of the standard PCI config space
110 const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
111 // Size of the standard PCIe config space: 4KB
112 const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;
113
114 // Extended Capabilities
115 const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
116 const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
117 const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
118 const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;
119
120 #[cfg(feature = "direct")]
121 const LPSS_MANATEE_OFFSET: u64 = 0x400;
122 #[cfg(feature = "direct")]
123 const LPSS_MANATEE_SIZE: u64 = 0x400;
124
125 struct VfioPmCap {
126 offset: u32,
127 capabilities: u32,
128 config: PmConfig,
129 }
130
131 impl VfioPmCap {
132 fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
133 let mut capabilities: u32 = config.read_config(cap_start);
134 capabilities |= (PciPmCap::default_cap() as u32) << 16;
135 VfioPmCap {
136 offset: cap_start,
137 capabilities,
138 config: PmConfig::new(),
139 }
140 }
141
142 pub fn should_trigger_pme(&mut self) -> bool {
143 self.config.should_trigger_pme()
144 }
145
146 fn is_pm_reg(&self, offset: u32) -> bool {
147 (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
148 }
149
150 pub fn read(&self, offset: u32) -> u32 {
151 let offset = offset - self.offset;
152 if offset == 0 {
153 self.capabilities
154 } else {
155 let mut data = 0;
156 self.config.read(&mut data);
157 data
158 }
159 }
160
161 pub fn write(&mut self, offset: u64, data: &[u8]) {
162 let offset = offset - self.offset as u64;
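// The first dword of the capability (cap ID, next pointer, PMC) is read-only;
// only writes at or beyond the PMCSR (offset 4) are forwarded to the emulated PmConfig.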
163 if offset >= std::mem::size_of::<u32>() as u64 {
164 let offset = offset - std::mem::size_of::<u32>() as u64;
165 self.config.write(offset, data);
166 }
167 }
168 }
169
170 enum VfioMsiChange {
171 Disable,
172 Enable,
173 FunctionChanged,
174 }
175
176 struct VfioMsiCap {
177 config: MsiConfig,
178 offset: u32,
179 }
180
181 impl VfioMsiCap {
182 fn new(
183 config: &VfioPciConfig,
184 msi_cap_start: u32,
185 vm_socket_irq: Tube,
186 device_id: u32,
187 device_name: String,
188 ) -> Self {
189 let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
190 let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
191 let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;
192
193 VfioMsiCap {
194 config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
195 offset: msi_cap_start,
196 }
197 }
198
199 fn is_msi_reg(&self, index: u64, len: usize) -> bool {
200 self.config.is_msi_reg(self.offset, index, len)
201 }
202
203 fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
204 let offset = index as u32 - self.offset;
205 match self.config.write_msi_capability(offset, data) {
206 MsiStatus::Enabled => Some(VfioMsiChange::Enable),
207 MsiStatus::Disabled => Some(VfioMsiChange::Disable),
208 MsiStatus::NothingToDo => None,
209 }
210 }
211
212 fn get_msi_irqfd(&self) -> Option<&Event> {
213 self.config.get_irqfd()
214 }
215
216 fn destroy(&mut self) {
217 self.config.destroy()
218 }
219 }
220
221 // MSI-X registers in MSI-X capability
222 const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
223 const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
224 const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
225 const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
226 const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
227 const PCI_MSIX_PBA: u32 = 0x08; // Pending bit Array offset
228 const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
229 const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
230
231 struct VfioMsixCap {
232 config: MsixConfig,
233 offset: u32,
234 table_size: u16,
235 table_pci_bar: u32,
236 table_offset: u64,
237 table_size_bytes: u64,
238 pba_pci_bar: u32,
239 pba_offset: u64,
240 pba_size_bytes: u64,
241 msix_interrupt_evt: Vec<Event>,
242 }
243
244 impl VfioMsixCap {
245 fn new(
246 config: &VfioPciConfig,
247 msix_cap_start: u32,
248 vm_socket_irq: Tube,
249 pci_id: u32,
250 device_name: String,
251 ) -> Self {
252 let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
253 let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
254 let table_pci_bar = table & PCI_MSIX_TABLE_BIR;
255 let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
256 let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
257 let pba_pci_bar = pba & PCI_MSIX_PBA_BIR;
258 let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;
259
260 let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
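// Some devices report a table size that would run into the PBA in the same BAR;
// in that case, shrink the table so it ends where the PBA begins.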
261 if table_pci_bar == pba_pci_bar
262 && pba_offset > table_offset
263 && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
264 {
265 table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
266 }
267
268 let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
269 let pba_size_bytes = ((table_size + BITS_PER_PBA_ENTRY as u64 - 1)
270 / BITS_PER_PBA_ENTRY as u64)
271 * MSIX_PBA_ENTRIES_MODULO;
272 let mut msix_interrupt_evt = Vec::new();
273 for _ in 0..table_size {
274 msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
275 }
276 VfioMsixCap {
277 config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
278 offset: msix_cap_start,
279 table_size: table_size as u16,
280 table_pci_bar,
281 table_offset,
282 table_size_bytes,
283 pba_pci_bar,
284 pba_offset,
285 pba_size_bytes,
286 msix_interrupt_evt,
287 }
288 }
289
290 // Only the MSI-X control register is writable and needs special handling in PCI config read/write.
291 fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
292 let control_start = self.offset + PCI_MSIX_FLAGS;
293 let control_end = control_start + 2;
294
295 offset < control_end && offset + size > control_start
296 }
297
298 fn read_msix_control(&self, data: &mut u32) {
299 *data = self.config.read_msix_capability(*data);
300 }
301
302 fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
303 let old_enabled = self.config.enabled();
304 let old_masked = self.config.masked();
305
306 self.config
307 .write_msix_capability(PCI_MSIX_FLAGS.into(), data);
308
309 let new_enabled = self.config.enabled();
310 let new_masked = self.config.masked();
311
312 if !old_enabled && new_enabled {
313 Some(VfioMsiChange::Enable)
314 } else if old_enabled && !new_enabled {
315 Some(VfioMsiChange::Disable)
316 } else if new_enabled && old_masked != new_masked {
317 Some(VfioMsiChange::FunctionChanged)
318 } else {
319 None
320 }
321 }
322
323 fn is_msix_table(&self, bar_index: u32, offset: u64) -> bool {
324 bar_index == self.table_pci_bar
325 && offset >= self.table_offset
326 && offset < self.table_offset + self.table_size_bytes
327 }
328
329 fn get_msix_table(&self, bar_index: u32) -> Option<AddressRange> {
330 if bar_index == self.table_pci_bar {
331 AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
332 } else {
333 None
334 }
335 }
336
337 fn read_table(&self, offset: u64, data: &mut [u8]) {
338 let offset = offset - self.table_offset;
339 self.config.read_msix_table(offset, data);
340 }
341
342 fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
343 let offset = offset - self.table_offset;
344 self.config.write_msix_table(offset, data)
345 }
346
347 fn is_msix_pba(&self, bar_index: u32, offset: u64) -> bool {
348 bar_index == self.pba_pci_bar
349 && offset >= self.pba_offset
350 && offset < self.pba_offset + self.pba_size_bytes
351 }
352
353 fn get_msix_pba(&self, bar_index: u32) -> Option<AddressRange> {
354 if bar_index == self.pba_pci_bar {
355 AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
356 } else {
357 None
358 }
359 }
360
361 fn read_pba(&self, offset: u64, data: &mut [u8]) {
362 let offset = offset - self.pba_offset;
363 self.config.read_pba_entries(offset, data);
364 }
365
366 fn write_pba(&mut self, offset: u64, data: &[u8]) {
367 let offset = offset - self.pba_offset;
368 self.config.write_pba_entries(offset, data);
369 }
370
371 fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
372 let irqfd = self.config.get_irqfd(index);
373 if let Some(fd) = irqfd {
374 if self.msix_vector_masked(index) {
375 Some(&self.msix_interrupt_evt[index])
376 } else {
377 Some(fd)
378 }
379 } else {
380 None
381 }
382 }
383
384 fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
385 let mut irqfds = Vec::new();
386
387 for i in 0..self.table_size {
388 irqfds.push(self.get_msix_irqfd(i as usize));
389 }
390
391 irqfds
392 }
393
394 fn table_size(&self) -> usize {
395 self.table_size.into()
396 }
397
398 fn clone_msix_evt(&self) -> Vec<Event> {
399 self.msix_interrupt_evt
400 .iter()
401 .map(|irq| irq.try_clone().unwrap())
402 .collect()
403 }
404
405 fn msix_vector_masked(&self, index: usize) -> bool {
406 !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
407 }
408
409 fn trigger(&mut self, index: usize) {
410 self.config.trigger(index as u16);
411 }
412
413 fn destroy(&mut self) {
414 self.config.destroy()
415 }
416 }
417
418 struct VfioResourceAllocator {
419 // The region that is not allocated yet.
420 regions: BTreeSet<AddressRange>,
421 }
422
423 impl VfioResourceAllocator {
424 // Creates a new `VfioResourceAllocator` for managing VFIO resources.
425 // Returns `Err` if `pool` is empty.
426 //
427 // * `pool` - The address range to manage (start address and size in bytes).
429 fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
430 if pool.is_empty() {
431 return Err(PciDeviceError::SizeZero);
432 }
433 let mut regions = BTreeSet::new();
434 regions.insert(pool);
435 Ok(VfioResourceAllocator { regions })
436 }
437
438 fn internal_allocate_from_slot(
439 &mut self,
440 slot: AddressRange,
441 range: AddressRange,
442 ) -> Result<u64, PciDeviceError> {
443 let slot_was_present = self.regions.remove(&slot);
444 assert!(slot_was_present);
445
446 let (before, after) = slot.non_overlapping_ranges(range);
447
448 if !before.is_empty() {
449 self.regions.insert(before);
450 }
451 if !after.is_empty() {
452 self.regions.insert(after);
453 }
454
455 Ok(range.start)
456 }
457
458 // Allocates a range of addresses from the managed region with a minimal alignment.
459 // Overlapping with a previous allocation is _not_ allowed.
460 // Returns allocated address.
461 fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
462 if size == 0 {
463 return Err(PciDeviceError::SizeZero);
464 }
465 if !alignment.is_power_of_two() {
466 return Err(PciDeviceError::BadAlignment);
467 }
468
469 // finds first region matching alignment and size.
470 let region = self.regions.iter().find(|range| {
471 match range.start % alignment {
472 0 => range.start.checked_add(size - 1),
473 r => range.start.checked_add(size - 1 + alignment - r),
474 }
475 .map_or(false, |end| end <= range.end)
476 });
477
478 match region {
479 Some(&slot) => {
480 let start = match slot.start % alignment {
481 0 => slot.start,
482 r => slot.start + alignment - r,
483 };
484 let end = start + size - 1;
485 let range = AddressRange::from_start_and_end(start, end);
486
487 self.internal_allocate_from_slot(slot, range)
488 }
489 None => Err(PciDeviceError::OutOfSpace),
490 }
491 }
492
493 // Allocates a range of addresses from the managed region with a required location.
494 // Overlapping with a previous allocation is allowed.
495 fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
496 if range.is_empty() {
497 return Err(PciDeviceError::SizeZero);
498 }
499
500 while let Some(&slot) = self
501 .regions
502 .iter()
503 .find(|avail_range| avail_range.overlaps(range))
504 {
505 let _address = self.internal_allocate_from_slot(slot, range)?;
506 }
507 Ok(())
508 }
509 }
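
// A minimal illustrative test (not part of the original file) sketching how
// `VfioResourceAllocator::allocate_with_align` hands out aligned ranges from its
// free pool; the expected addresses below follow directly from the logic above.
#[cfg(test)]
mod vfio_resource_allocator_example {
    use super::*;

    #[test]
    fn allocate_with_align_example() {
        // Manage the range [0x1000, 0x2fff].
        let mut allocator =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(0x1000, 0x2fff)).unwrap();
        // An already-aligned request is placed at the start of the first free region.
        assert_eq!(allocator.allocate_with_align(0x100, 0x100).unwrap(), 0x1000);
        // The next request is rounded up to its alignment within the remaining
        // free space ([0x1100, 0x2fff] after the first allocation).
        assert_eq!(allocator.allocate_with_align(0x200, 0x800).unwrap(), 0x1800);
        // Zero-sized requests are rejected.
        assert!(allocator.allocate_with_align(0, 0x100).is_err());
    }
}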
510
511 struct VfioPciWorker {
512 address: PciAddress,
513 sysfs_path: PathBuf,
514 vm_socket: Tube,
515 name: String,
516 pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
517 msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
518 }
519
520 impl VfioPciWorker {
521 fn run(
522 &mut self,
523 req_irq_evt: Event,
524 wakeup_evt: Event,
525 kill_evt: Event,
526 msix_evt: Vec<Event>,
527 ) {
528 #[derive(EventToken)]
529 enum Token {
530 ReqIrq,
531 WakeUp,
532 Kill,
533 MsixIrqi { index: usize },
534 }
535
536 let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
537 (&req_irq_evt, Token::ReqIrq),
538 (&wakeup_evt, Token::WakeUp),
539 (&kill_evt, Token::Kill),
540 ]) {
541 Ok(pc) => pc,
542 Err(e) => {
543 error!(
544 "{} failed creating vfio WaitContext: {}",
545 self.name.clone(),
546 e
547 );
548 return;
549 }
550 };
551
552 for (index, msix_int) in msix_evt.iter().enumerate() {
553 wait_ctx
554 .add(msix_int, Token::MsixIrqi { index })
555 .expect("Failed to create vfio WaitContext for msix interrupt event")
556 }
557
558 'wait: loop {
559 let events = match wait_ctx.wait() {
560 Ok(v) => v,
561 Err(e) => {
562 error!("{} failed polling vfio events: {}", self.name.clone(), e);
563 break;
564 }
565 };
566
567 for event in events.iter().filter(|e| e.is_readable) {
568 match event.token {
569 Token::MsixIrqi { index } => {
570 if let Some(msix_cap) = &self.msix_cap {
571 msix_cap.lock().trigger(index);
572 }
573 }
574 Token::ReqIrq => {
575 let device = HotPlugDeviceInfo {
576 device_type: HotPlugDeviceType::EndPoint,
577 path: self.sysfs_path.clone(),
578 hp_interrupt: false,
579 };
580
581 let request = VmRequest::HotPlugCommand { device, add: false };
582 if self.vm_socket.send(&request).is_ok() {
583 if let Err(e) = self.vm_socket.recv::<VmResponse>() {
584 error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
585 } else {
586 break 'wait;
587 }
588 }
589 }
590 Token::WakeUp => {
591 let _ = wakeup_evt.wait();
592 if let Some(pm_cap) = &self.pm_cap {
593 if pm_cap.lock().should_trigger_pme() {
594 let request = VmRequest::PciPme(self.address.pme_requester_id());
595 if self.vm_socket.send(&request).is_ok() {
596 if let Err(e) = self.vm_socket.recv::<VmResponse>() {
597 error!("{} failed to send PME: {}", self.name.clone(), e);
598 }
599 }
600 }
601 }
602 }
603 Token::Kill => break 'wait,
604 }
605 }
606 }
607 }
608 }
609
610 fn get_next_from_extcap_header(cap_header: u32) -> u32 {
611 (cap_header >> 20) & 0xffc
612 }
613
614 fn is_skipped_ext_cap(cap_id: u16) -> bool {
615 matches!(
616 cap_id,
617 // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
618 PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
619 )
620 }
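
// A small illustrative test (not part of the original file) for the extended
// capability header helpers above. The header layout is assumed per the PCIe
// spec: bits [15:0] = cap ID, [19:16] = version, [31:20] = next cap offset.
#[cfg(test)]
mod ext_cap_header_example {
    use super::*;

    #[test]
    fn next_pointer_and_skip_check() {
        // Header for a capability with next pointer 0x140, version 1, ID 0x0001.
        let header: u32 = (0x140 << 20) | (0x1 << 16) | 0x0001;
        assert_eq!(get_next_from_extcap_header(header), 0x140);
        // A zero next pointer terminates the chain.
        assert_eq!(get_next_from_extcap_header(0x0001_0002), 0);
        // SR-IOV is one of the capabilities hidden from the guest.
        assert!(is_skipped_ext_cap(PCI_EXT_CAP_ID_SRIOV));
        assert!(!is_skipped_ext_cap(0x0001));
    }
}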
621
622 enum DeviceData {
623 IntelGfxData { opregion_index: u32 },
624 }
625
626 /// PCI Express Extended Capabilities information
627 #[derive(Copy, Clone)]
628 struct ExtCap {
629 /// cap offset in Configuration Space
630 offset: u32,
631 /// cap size
632 size: u32,
633 /// next capability offset; for a non-skipped cap, this is rewritten to the next non-skipped offset
634 next: u16,
635 /// whether this capability is hidden from the guest
636 is_skipped: bool,
637 }
638
639 /// Implements a VFIO PCI device that can be added to the VM as a PCI device.
640 pub struct VfioPciDevice {
641 device: Arc<VfioDevice>,
642 config: VfioPciConfig,
643 hotplug: bool,
644 hotplug_bus_number: Option<u8>,
645 preferred_address: PciAddress,
646 pci_address: Option<PciAddress>,
647 interrupt_evt: Option<IrqLevelEvent>,
648 mmio_regions: Vec<PciBarConfiguration>,
649 io_regions: Vec<PciBarConfiguration>,
650 pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
651 msi_cap: Option<VfioMsiCap>,
652 msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
653 irq_type: Option<VfioIrqType>,
654 vm_socket_mem: Tube,
655 device_data: Option<DeviceData>,
656 pm_evt: Option<Event>,
657 worker_thread: Option<WorkerThread<VfioPciWorker>>,
658 vm_socket_vm: Option<Tube>,
659 sysfs_path: PathBuf,
660 #[cfg(feature = "direct")]
661 header_type_reg: Option<u32>,
662 // PCI Express Extended Capabilities
663 ext_caps: Vec<ExtCap>,
664 #[cfg(feature = "direct")]
665 is_intel_lpss: bool,
666 #[cfg(feature = "direct")]
667 supports_coordinated_pm: bool,
668 #[cfg(feature = "direct")]
669 i2c_devs: HashMap<u16, PathBuf>,
670 vcfg_shm_mmap: Option<MemoryMapping>,
671 mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<MemSlot>)>,
672 activated: bool,
673 }
674
675 #[cfg(feature = "direct")]
676 fn iter_dir_starts_with(
677 path: &Path,
678 start: &'static str,
679 ) -> anyhow::Result<impl Iterator<Item = fs::DirEntry>> {
680 let dir = fs::read_dir(path)
681 .with_context(|| format!("read_dir call on {} failed", path.to_string_lossy()))?;
682 Ok(dir
683 .filter_map(|e| e.ok())
684 .filter(|e| e.file_type().map(|f| f.is_dir()).unwrap_or(false))
685 .filter(move |e| e.file_name().to_str().unwrap_or("").starts_with(start)))
686 }
687
688 impl VfioPciDevice {
689 /// Constructs a new VFIO PCI device for the given VFIO device.
690 pub fn new(
691 sysfs_path: &Path,
692 device: VfioDevice,
693 hotplug: bool,
694 hotplug_bus_number: Option<u8>,
695 guest_address: Option<PciAddress>,
696 vfio_device_socket_msi: Tube,
697 vfio_device_socket_msix: Tube,
698 vfio_device_socket_mem: Tube,
699 vfio_device_socket_vm: Tube,
700 #[cfg(feature = "direct")] is_intel_lpss: bool,
701 ) -> Result<Self, PciDeviceError> {
702 let preferred_address = if let Some(bus_num) = hotplug_bus_number {
703 debug!("hotplug bus {}", bus_num);
704 PciAddress {
705 // The caller specifies the PCIe bus number for a hotplug device.
706 bus: bus_num,
707 // devfn should be 0, otherwise the PCIe root port can't detect it.
708 dev: 0,
709 func: 0,
710 }
711 } else if let Some(guest_address) = guest_address {
712 debug!("guest PCI address {}", guest_address);
713 guest_address
714 } else {
715 let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
716 PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
717 })?;
718 debug!("parsed device PCI address {}", addr);
719 addr
720 };
721
722 let dev = Arc::new(device);
723 let config = VfioPciConfig::new(Arc::clone(&dev));
724 let mut msi_socket = Some(vfio_device_socket_msi);
725 let mut msix_socket = Some(vfio_device_socket_msix);
726 let mut msi_cap: Option<VfioMsiCap> = None;
727 let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
728 let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;
729
730 let mut is_pcie = false;
731 let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
732 let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
733 let device_id: u16 = config.read_config(PCI_DEVICE_ID);
734
735 let pci_id = PciId::new(vendor_id, device_id);
736
737 while cap_next != 0 {
738 let cap_id: u8 = config.read_config(cap_next);
739 if cap_id == PCI_CAP_ID_PM {
740 pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
741 } else if cap_id == PCI_CAP_ID_MSI {
742 if let Some(msi_socket) = msi_socket.take() {
743 msi_cap = Some(VfioMsiCap::new(
744 &config,
745 cap_next,
746 msi_socket,
747 pci_id.into(),
748 dev.device_name().to_string(),
749 ));
750 }
751 } else if cap_id == PCI_CAP_ID_MSIX {
752 if let Some(msix_socket) = msix_socket.take() {
753 msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
754 &config,
755 cap_next,
756 msix_socket,
757 pci_id.into(),
758 dev.device_name().to_string(),
759 ))));
760 }
761 } else if cap_id == PciCapabilityID::PciExpress as u8 {
762 is_pcie = true;
763 }
764 let offset = cap_next + PCI_MSI_NEXT_POINTER;
765 cap_next = config.read_config::<u8>(offset).into();
766 }
767
768 let mut ext_caps: Vec<ExtCap> = Vec::new();
769 if is_pcie {
770 let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
771 while ext_cap_next != 0 {
772 let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
773 if ext_cap_config == 0 {
774 break;
775 }
776 ext_caps.push(ExtCap {
777 offset: ext_cap_next,
778 // Calculate the size later
779 size: 0,
780 // initialized to the real hardware next pointer; fixed up below
781 next: get_next_from_extcap_header(ext_cap_config) as u16,
782 is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
783 });
784 ext_cap_next = get_next_from_extcap_header(ext_cap_config);
785 }
786
787 // Manage extended caps
788 //
789 // Extended capabilities are chained with each pointing to the next, so
790 // we can drop anything other than the head of the chain simply by
791 // modifying the previous next pointer. For the head of the chain, we
792 // can modify the capability ID to something that cannot match a valid
793 // capability. ID PCI_EXT_CAP_ID_CAC is used for this since it is no longer
794 // supported.
795 //
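// For example (hypothetical layout): with extended caps at 0x100 (ARI, skipped),
// 0x148 and 0x1d0 (both exposed), the loop below walks the chain from high to low
// offsets and rewires it so 0x1d0 points to 0, 0x148 points to 0x1d0, and the
// skipped head at 0x100 points to 0x148, so the guest walks straight past it.
//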
796 // reverse order by offset
797 ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
798 let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
799 let mut non_skipped_next: u16 = 0;
800 for ext_cap in ext_caps.iter_mut() {
801 if !ext_cap.is_skipped {
802 ext_cap.next = non_skipped_next;
803 non_skipped_next = ext_cap.offset as u16;
804 } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
805 ext_cap.next = non_skipped_next;
806 }
807 ext_cap.size = next_offset - ext_cap.offset;
808 next_offset = ext_cap.offset;
809 }
810 // order by offset
811 ext_caps.reverse();
812 }
813
814 let class_code: u8 = config.read_config(PCI_BASE_CLASS_CODE);
815
816 let is_intel_gfx = vendor_id == PCI_VENDOR_ID_INTEL
817 && class_code == PciClassCode::DisplayController.get_register_value();
818 let device_data = if is_intel_gfx {
819 Some(DeviceData::IntelGfxData {
820 opregion_index: u32::max_value(),
821 })
822 } else {
823 None
824 };
825
826 #[cfg(feature = "direct")]
827 let mut i2c_devs: HashMap<u16, PathBuf> = HashMap::new();
828
829 #[cfg(feature = "direct")]
830 let (supports_coordinated_pm, header_type_reg) =
831 match VfioPciDevice::coordinated_pm(sysfs_path, true) {
832 Ok(_) => {
833 if is_intel_lpss {
834 if let Err(e) = VfioPciDevice::coordinated_pm_i2c(sysfs_path, &mut i2c_devs)
835 {
836 warn!("coordinated_pm_i2c not supported: {}", e);
837 for (_, i2c_path) in i2c_devs.iter() {
838 let _ = VfioPciDevice::coordinated_pm(i2c_path, false);
839 }
840 i2c_devs.clear();
841 }
842 }
843
844 // Cache the dword at offset 0x0c (cacheline size, latency timer,
845 // header type, BIST).
846 // When using the "direct" feature, this dword can be accessed for
847 // device power state. Directly accessing a device's physical PCI
848 // config space in D3cold state causes a hang. We treat the cacheline
849 // size, latency timer and header type field as immutable in the
850 // guest.
851 let reg: u32 = config.read_config((HEADER_TYPE_REG as u32) * 4);
852 (true, Some(reg))
853 }
854 Err(e) => {
855 warn!("coordinated_pm not supported: {}", e);
856 (false, None)
857 }
858 };
859
860 Ok(VfioPciDevice {
861 device: dev,
862 config,
863 hotplug,
864 hotplug_bus_number,
865 preferred_address,
866 pci_address: None,
867 interrupt_evt: None,
868 mmio_regions: Vec::new(),
869 io_regions: Vec::new(),
870 pm_cap,
871 msi_cap,
872 msix_cap,
873 irq_type: None,
874 vm_socket_mem: vfio_device_socket_mem,
875 device_data,
876 pm_evt: None,
877 worker_thread: None,
878 vm_socket_vm: Some(vfio_device_socket_vm),
879 sysfs_path: sysfs_path.to_path_buf(),
880 #[cfg(feature = "direct")]
881 header_type_reg,
882 ext_caps,
883 #[cfg(feature = "direct")]
884 is_intel_lpss,
885 #[cfg(feature = "direct")]
886 supports_coordinated_pm,
887 #[cfg(feature = "direct")]
888 i2c_devs,
889 vcfg_shm_mmap: None,
890 mapped_mmio_bars: BTreeMap::new(),
891 activated: false,
892 })
893 }
894
895 /// Gets the pci address of the device, if one has already been allocated.
896 pub fn pci_address(&self) -> Option<PciAddress> {
897 self.pci_address
898 }
899
900 fn is_intel_gfx(&self) -> bool {
901 let mut ret = false;
902
903 if let Some(device_data) = &self.device_data {
904 match *device_data {
905 DeviceData::IntelGfxData { .. } => ret = true,
906 }
907 }
908
909 ret
910 }
911
912 fn find_region(&self, addr: u64) -> Option<PciBarConfiguration> {
913 for mmio_info in self.mmio_regions.iter() {
914 if addr >= mmio_info.address() && addr < mmio_info.address() + mmio_info.size() {
915 return Some(*mmio_info);
916 }
917 }
918
919 None
920 }
921
922 fn enable_intx(&mut self) {
923 if let Some(ref interrupt_evt) = self.interrupt_evt {
924 if let Err(e) = self.device.irq_enable(
925 &[Some(interrupt_evt.get_trigger())],
926 VFIO_PCI_INTX_IRQ_INDEX,
927 0,
928 ) {
929 error!("{} Intx enable failed: {}", self.debug_label(), e);
930 return;
931 }
932 if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
933 error!("{} Intx mask failed: {}", self.debug_label(), e);
934 self.disable_intx();
935 return;
936 }
937 if let Err(e) = self
938 .device
939 .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
940 {
941 error!("{} resample enable failed: {}", self.debug_label(), e);
942 self.disable_intx();
943 return;
944 }
945 if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
946 error!("{} Intx unmask failed: {}", self.debug_label(), e);
947 self.disable_intx();
948 return;
949 }
950 self.irq_type = Some(VfioIrqType::Intx);
951 }
952 }
953
954 fn disable_intx(&mut self) {
955 if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
956 error!("{} Intx disable failed: {}", self.debug_label(), e);
957 }
958 self.irq_type = None;
959 }
960
961 fn disable_irqs(&mut self) {
962 match self.irq_type {
963 Some(VfioIrqType::Msi) => self.disable_msi(),
964 Some(VfioIrqType::Msix) => self.disable_msix(),
965 _ => (),
966 }
967
968 // disable_msi() or disable_msix() above will re-enable INTx,
969 // so disable INTx here again.
970 if let Some(VfioIrqType::Intx) = self.irq_type {
971 self.disable_intx();
972 }
973 }
974
975 fn enable_msi(&mut self) {
976 self.disable_irqs();
977
978 let irqfd = match &self.msi_cap {
979 Some(cap) => {
980 if let Some(fd) = cap.get_msi_irqfd() {
981 fd
982 } else {
983 self.enable_intx();
984 return;
985 }
986 }
987 None => {
988 self.enable_intx();
989 return;
990 }
991 };
992
993 if let Err(e) = self
994 .device
995 .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
996 {
997 error!("{} failed to enable msi: {}", self.debug_label(), e);
998 self.enable_intx();
999 return;
1000 }
1001
1002 self.irq_type = Some(VfioIrqType::Msi);
1003 }
1004
1005 fn disable_msi(&mut self) {
1006 if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
1007 error!("{} failed to disable msi: {}", self.debug_label(), e);
1008 return;
1009 }
1010 self.irq_type = None;
1011
1012 self.enable_intx();
1013 }
1014
1015 fn enable_msix(&mut self) {
1016 if self.msix_cap.is_none() {
1017 return;
1018 }
1019
1020 self.disable_irqs();
1021 let cap = self.msix_cap.as_ref().unwrap().lock();
1022 let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());
1023
1024 let mut failed = false;
1025 if !vector_in_use {
1026 // If there are no msix vectors currently in use, we explicitly assign a new eventfd
1027 // to vector 0. Then we enable it and immediately disable it, so that vfio will
1028 // activate physical device. If there are available msix vectors, just enable them
1029 // instead.
1030 let fd = Event::new().expect("failed to create event");
1031 let table_size = cap.table_size();
1032 let mut irqfds = vec![None; table_size];
1033 irqfds[0] = Some(&fd);
1034 for fd in irqfds.iter_mut().skip(1) {
1035 *fd = None;
1036 }
1037 if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1038 error!("{} failed to enable msix: {}", self.debug_label(), e);
1039 failed = true;
1040 }
1041 irqfds[0] = None;
1042 if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1043 error!("{} failed to enable msix: {}", self.debug_label(), e);
1044 failed = true;
1045 }
1046 } else {
1047 let result = self
1048 .device
1049 .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
1050 if let Err(e) = result {
1051 error!("{} failed to enable msix: {}", self.debug_label(), e);
1052 failed = true;
1053 }
1054 }
1055
1056 std::mem::drop(cap);
1057 if failed {
1058 self.enable_intx();
1059 return;
1060 }
1061 self.irq_type = Some(VfioIrqType::Msix);
1062 }
1063
1064 fn disable_msix(&mut self) {
1065 if self.msix_cap.is_none() {
1066 return;
1067 }
1068 if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
1069 error!("{} failed to disable msix: {}", self.debug_label(), e);
1070 return;
1071 }
1072 self.irq_type = None;
1073 self.enable_intx();
1074 }
1075
1076 fn msix_vectors_update(&self) -> Result<(), VfioError> {
1077 if let Some(cap) = &self.msix_cap {
1078 self.device
1079 .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
1080 }
1081 Ok(())
1082 }
1083
1084 fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
1085 if let Err(e) = self
1086 .device
1087 .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
1088 {
1089 error!(
1090 "{} failed to update msix vector {}: {}",
1091 self.debug_label(),
1092 index,
1093 e
1094 );
1095 }
1096 }
1097
1098 fn adjust_bar_mmap(
1099 &self,
1100 bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1101 remove_mmaps: &[AddressRange],
1102 ) -> Vec<vfio_region_sparse_mmap_area> {
1103 let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
1104 let pgmask = (pagesize() as u64) - 1;
1105
1106 for mmap in bar_mmaps.iter() {
1107 let mmap_range = if let Some(mmap_range) =
1108 AddressRange::from_start_and_size(mmap.offset as u64, mmap.size as u64)
1109 {
1110 mmap_range
1111 } else {
1112 continue;
1113 };
1114 let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
1115 Ok(a) => a,
1116 Err(e) => {
1117 error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
1118 mmaps.clear();
1119 return mmaps;
1120 }
1121 };
1122
1123 for &(mut remove_range) in remove_mmaps.iter() {
1124 remove_range = remove_range.intersect(mmap_range);
1125 if !remove_range.is_empty() {
1126 // align offsets to page size
1127 let begin = remove_range.start & !pgmask;
1128 let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
1129 let remove_range = AddressRange::from_start_and_end(begin, end);
1130 if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
1131 error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
1132 }
1133 }
1134 }
1135
1136 for mmap in to_mmap.regions {
1137 mmaps.push(vfio_region_sparse_mmap_area {
1138 offset: mmap.start,
1139 size: mmap.end - mmap.start + 1,
1140 });
1141 }
1142 }
1143
1144 mmaps
1145 }
1146
1147 fn remove_bar_mmap_msix(
1148 &self,
1149 bar_index: u32,
1150 bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1151 ) -> Vec<vfio_region_sparse_mmap_area> {
1152 let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
1153 let mut msix_regions = Vec::new();
1154
1155 if let Some(t) = msix_cap.get_msix_table(bar_index) {
1156 msix_regions.push(t);
1157 }
1158 if let Some(p) = msix_cap.get_msix_pba(bar_index) {
1159 msix_regions.push(p);
1160 }
1161
1162 if msix_regions.is_empty() {
1163 return bar_mmaps;
1164 }
1165
1166 self.adjust_bar_mmap(bar_mmaps, &msix_regions)
1167 }
1168
1169 #[cfg(feature = "direct")]
1170 fn remove_bar_mmap_lpss(
1171 &self,
1172 bar_index: u32,
1173 bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1174 ) -> Vec<vfio_region_sparse_mmap_area> {
1175 // must be BAR0
1176 if bar_index != 0 {
1177 return bar_mmaps;
1178 }
1179
1180 match AddressRange::from_start_and_size(LPSS_MANATEE_OFFSET, LPSS_MANATEE_SIZE) {
1181 Some(lpss_range) => self.adjust_bar_mmap(bar_mmaps, &[lpss_range]),
1182 None => bar_mmaps,
1183 }
1184 }
1185
1186 fn add_bar_mmap(&self, index: u32, bar_addr: u64) -> Vec<MemSlot> {
1187 let mut mmaps_slots: Vec<MemSlot> = Vec::new();
1188 if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1189 // The BAR regions holding the MSI-X table and PBA can't be mmapped;
1190 // they must be trapped so that MSI-X can be emulated.
1191 let mut mmaps = self.device.get_region_mmap(index);
1192
1193 if self.msix_cap.is_some() {
1194 mmaps = self.remove_bar_mmap_msix(index, mmaps);
1195 }
1196 #[cfg(feature = "direct")]
1197 if self.is_intel_lpss {
1198 mmaps = self.remove_bar_mmap_lpss(index, mmaps);
1199 }
1200 if mmaps.is_empty() {
1201 return mmaps_slots;
1202 }
1203
1204 for mmap in mmaps.iter() {
1205 let mmap_offset = mmap.offset;
1206 let mmap_size = mmap.size;
1207 let guest_map_start = bar_addr + mmap_offset;
1208 let region_offset = self.device.get_region_offset(index);
1209 let offset = region_offset + mmap_offset;
1210 let descriptor = match self.device.device_file().try_clone() {
1211 Ok(device_file) => device_file.into(),
1212 Err(_) => break,
1213 };
1214 if self
1215 .vm_socket_mem
1216 .send(&VmMemoryRequest::RegisterMemory {
1217 source: VmMemorySource::Descriptor {
1218 descriptor,
1219 offset,
1220 size: mmap_size,
1221 },
1222 dest: VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
1223 prot: Protection::read_write(),
1224 })
1225 .is_err()
1226 {
1227 break;
1228 }
1229
1230 let response: VmMemoryResponse = match self.vm_socket_mem.recv() {
1231 Ok(res) => res,
1232 Err(_) => break,
1233 };
1234 match response {
1235 VmMemoryResponse::RegisterMemory { pfn: _, slot } => {
1236 mmaps_slots.push(slot);
1237 }
1238 _ => break,
1239 }
1240 }
1241 }
1242
1243 mmaps_slots
1244 }
1245
1246 fn remove_bar_mmap(&self, mmap_slots: &[MemSlot]) {
1247 for mmap_slot in mmap_slots {
1248 if self
1249 .vm_socket_mem
1250 .send(&VmMemoryRequest::UnregisterMemory(*mmap_slot))
1251 .is_err()
1252 {
1253 error!("failed to send UnregisterMemory request");
1254 return;
1255 }
1256 if self.vm_socket_mem.recv::<VmMemoryResponse>().is_err() {
1257 error!("failed to receive UnregisterMemory response");
1258 }
1259 }
1260 }
1261
1262 fn disable_bars_mmap(&mut self) {
1263 for (_, (_, mmap_slots)) in self.mapped_mmio_bars.iter() {
1264 self.remove_bar_mmap(mmap_slots);
1265 }
1266 self.mapped_mmio_bars.clear();
1267 }
1268
1269 fn commit_bars_mmap(&mut self) {
1270 // Unmap all bars before remapping bars, to prevent issues with overlap
1271 let mut needs_map = Vec::new();
1272 for mmio_info in self.mmio_regions.iter() {
1273 let bar_idx = mmio_info.bar_index();
1274 let addr = mmio_info.address();
1275
1276 if let Some((cur_addr, slots)) = self.mapped_mmio_bars.remove(&bar_idx) {
1277 if cur_addr == addr {
1278 self.mapped_mmio_bars.insert(bar_idx, (cur_addr, slots));
1279 continue;
1280 } else {
1281 self.remove_bar_mmap(&slots);
1282 }
1283 }
1284
1285 if addr != 0 {
1286 needs_map.push((bar_idx, addr));
1287 }
1288 }
1289
1290 for (bar_idx, addr) in needs_map.iter() {
1291 let slots = self.add_bar_mmap(*bar_idx as u32, *addr);
1292 self.mapped_mmio_bars.insert(*bar_idx, (*addr, slots));
1293 }
1294 }
1295
1296 fn close(&mut self) {
1297 if let Some(msi) = self.msi_cap.as_mut() {
1298 msi.destroy();
1299 }
1300 if let Some(msix) = &self.msix_cap {
1301 msix.lock().destroy();
1302 }
1303 self.disable_bars_mmap();
1304 self.device.close();
1305 }
1306
1307 fn start_work_thread(&mut self) {
1308 let vm_socket = match self.vm_socket_vm.take() {
1309 Some(socket) => socket,
1310 None => return,
1311 };
1312
1313 let req_evt = match Event::new() {
1314 Ok(evt) => {
1315 if let Err(e) = self
1316 .device
1317 .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
1318 {
1319 error!("{} enable req_irq failed: {}", self.debug_label(), e);
1320 return;
1321 }
1322 evt
1323 }
1324 Err(_) => return,
1325 };
1326
1327 let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
1328 Ok(v) => v,
1329 Err(e) => {
1330 error!(
1331 "{} failed creating PM Event pair: {}",
1332 self.debug_label(),
1333 e
1334 );
1335 return;
1336 }
1337 };
1338 self.pm_evt = Some(self_pm_evt);
1339
1340 let mut msix_evt = Vec::new();
1341 if let Some(msix_cap) = &self.msix_cap {
1342 msix_evt = msix_cap.lock().clone_msix_evt();
1343 }
1344
1345 let name = self.device.device_name().to_string();
1346 let address = self.pci_address.expect("Unassigned PCI Address.");
1347 let sysfs_path = self.sysfs_path.clone();
1348 let pm_cap = self.pm_cap.clone();
1349 let msix_cap = self.msix_cap.clone();
1350 self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
1351 let mut worker = VfioPciWorker {
1352 address,
1353 sysfs_path,
1354 vm_socket,
1355 name,
1356 pm_cap,
1357 msix_cap,
1358 };
1359 worker.run(req_evt, pm_evt, kill_evt, msix_evt);
1360 worker
1361 }));
1362 self.activated = true;
1363 }
1364
1365 fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
1366 let mut i = VFIO_PCI_BAR0_REGION_INDEX;
1367 let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();
1368
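// Standard PCI BAR sizing probe: write all 1s to each BAR register, read back the
// value, and derive the region size from the bits that remain hard-wired to zero.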
1369 while i <= VFIO_PCI_ROM_REGION_INDEX {
1370 let mut low: u32 = 0xffffffff;
1371 let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
1372 0x30
1373 } else {
1374 0x10 + i * 4
1375 };
1376 self.config.write_config(low, offset);
1377 low = self.config.read_config(offset);
1378
1379 let low_flag = low & 0xf;
1380 let is_64bit = low_flag & 0x4 == 0x4;
1381 if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
1382 let mut upper: u32 = 0xffffffff;
1383 if is_64bit {
1384 self.config.write_config(upper, offset + 4);
1385 upper = self.config.read_config(offset + 4);
1386 }
1387
1388 low &= 0xffff_fff0;
1389 let mut size: u64 = u64::from(upper);
1390 size <<= 32;
1391 size |= u64::from(low);
1392 size = !size + 1;
1393 let region_type = if is_64bit {
1394 PciBarRegionType::Memory64BitRegion
1395 } else {
1396 PciBarRegionType::Memory32BitRegion
1397 };
1398 let prefetch = if low_flag & 0x8 == 0x8 {
1399 PciBarPrefetchable::Prefetchable
1400 } else {
1401 PciBarPrefetchable::NotPrefetchable
1402 };
1403 mem_bars.push(PciBarConfiguration::new(
1404 i as usize,
1405 size,
1406 region_type,
1407 prefetch,
1408 ));
1409 } else if low_flag & 0x1 == 0x1 {
1410 let size = !(low & 0xffff_fffc) + 1;
1411 self.io_regions.push(PciBarConfiguration::new(
1412 i as usize,
1413 size.into(),
1414 PciBarRegionType::IoRegion,
1415 PciBarPrefetchable::NotPrefetchable,
1416 ));
1417 }
1418
1419 if is_64bit {
1420 i += 2;
1421 } else {
1422 i += 1;
1423 }
1424 }
1425 mem_bars
1426 }
1427
1428 fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
1429 let offset: u32 = bar_info.reg_index() as u32 * 4;
1430 let mmio_region = *bar_info;
1431 self.mmio_regions.push(mmio_region.set_address(bar_addr));
1432
1433 let val: u32 = self.config.read_config(offset);
1434 let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
1435 self.config.write_config(low, offset);
1436 if bar_info.is_64bit_memory() {
1437 let upper = (bar_addr >> 32) as u32;
1438 self.config.write_config(upper, offset + 4);
1439 }
1440 }
1441
1442 fn allocate_root_barmem(
1443 &mut self,
1444 mem_bars: &[PciBarConfiguration],
1445 resources: &mut SystemAllocator,
1446 ) -> Result<Vec<BarRange>, PciDeviceError> {
1447 let address = self.pci_address.unwrap();
1448 let mut ranges: Vec<BarRange> = Vec::new();
1449 for mem_bar in mem_bars {
1450 let bar_size = mem_bar.size();
1451 let mut bar_addr: u64 = 0;
1452 // Don't allocate mmio for hotplug device, OS will allocate it from
1453 // its parent's bridge window.
1454 if !self.hotplug {
1455 bar_addr = resources
1456 .allocate_mmio(
1457 bar_size,
1458 Alloc::PciBar {
1459 bus: address.bus,
1460 dev: address.dev,
1461 func: address.func,
1462 bar: mem_bar.bar_index() as u8,
1463 },
1464 "vfio_bar".to_string(),
1465 AllocOptions::new()
1466 .prefetchable(mem_bar.is_prefetchable())
1467 .max_address(if mem_bar.is_64bit_memory() {
1468 u64::MAX
1469 } else {
1470 u32::MAX.into()
1471 })
1472 .align(bar_size),
1473 )
1474 .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
1475 ranges.push(BarRange {
1476 addr: bar_addr,
1477 size: bar_size,
1478 prefetchable: mem_bar.is_prefetchable(),
1479 });
1480 }
1481 self.configure_barmem(mem_bar, bar_addr);
1482 }
1483 Ok(ranges)
1484 }
1485
1486 fn allocate_nonroot_barmem(
1487 &mut self,
1488 mem_bars: &mut [PciBarConfiguration],
1489 resources: &mut SystemAllocator,
1490 ) -> Result<Vec<BarRange>, PciDeviceError> {
1491 const NON_PREFETCHABLE: usize = 0;
1492 const PREFETCHABLE: usize = 1;
1493 const ARRAY_SIZE: usize = 2;
1494 let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1495 let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1496 match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1497 Ok(a) => a,
1498 Err(e) => {
1499 error!(
1500 "{} init nonroot VfioResourceAllocator failed: {}",
1501 self.debug_label(),
1502 e
1503 );
1504 return Err(e);
1505 }
1506 },
1507 match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1508 Ok(a) => a,
1509 Err(e) => {
1510 error!(
1511 "{} init nonroot VfioResourceAllocator failed: {}",
1512 self.debug_label(),
1513 e
1514 );
1515 return Err(e);
1516 }
1517 },
1518 ];
1519 let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1520 // the window must be 1M-aligned as per the PCI spec
1521 let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
1522 let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];
1523
1524 // Descend by bar size, this could reduce allocated size for all the bars.
1525 mem_bars.sort_by_key(|a| Reverse(a.size()));
1526 for mem_bar in mem_bars {
1527 let prefetchable = mem_bar.is_prefetchable();
1528 let is_64bit = mem_bar.is_64bit_memory();
1529
1530 // if one prefetchable bar is 32bit, all the prefetchable bars should be in Low MMIO,
1531 // as all the prefetchable bars should be in one region
1532 if prefetchable && !is_64bit {
1533 memtype[PREFETCHABLE] = MmioType::Low;
1534 }
1535 let i = if prefetchable {
1536 PREFETCHABLE
1537 } else {
1538 NON_PREFETCHABLE
1539 };
1540 let bar_size = mem_bar.size();
1541 let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1542 Ok(s) => s,
1543 Err(e) => {
1544 error!(
1545 "{} nonroot allocate_wit_align failed: {}",
1546 self.debug_label(),
1547 e
1548 );
1549 return Err(e);
1550 }
1551 };
1552 window_sz[i] = max(window_sz[i], start + bar_size);
1553 alignment[i] = max(alignment[i], bar_size);
1554 let mem_info = (*mem_bar).set_address(start);
1555 membars[i].push(mem_info);
1556 }
1557
1558 let address = self.pci_address.unwrap();
1559 let mut ranges: Vec<BarRange> = Vec::new();
1560 for (index, bars) in membars.iter().enumerate() {
1561 if bars.is_empty() {
1562 continue;
1563 }
1564
1565 let i = if index == 1 {
1566 PREFETCHABLE
1567 } else {
1568 NON_PREFETCHABLE
1569 };
1570 let mut window_addr: u64 = 0;
1571 // Don't allocate mmio for hotplug device, OS will allocate it from
1572 // its parent's bridge window.
1573 if !self.hotplug {
1574 window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
1575 let alloc = if i == NON_PREFETCHABLE {
1576 Alloc::PciBridgeWindow {
1577 bus: address.bus,
1578 dev: address.dev,
1579 func: address.func,
1580 }
1581 } else {
1582 Alloc::PciBridgePrefetchWindow {
1583 bus: address.bus,
1584 dev: address.dev,
1585 func: address.func,
1586 }
1587 };
1588 window_addr = resources
1589 .mmio_allocator(memtype[i])
1590 .allocate_with_align(
1591 window_sz[i],
1592 alloc,
1593 "vfio_bar_window".to_string(),
1594 alignment[i],
1595 )
1596 .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
1597 for mem_info in bars {
1598 let bar_addr = window_addr + mem_info.address();
1599 ranges.push(BarRange {
1600 addr: bar_addr,
1601 size: mem_info.size(),
1602 prefetchable: mem_info.is_prefetchable(),
1603 });
1604 }
1605 }
1606
1607 for mem_info in bars {
1608 let bar_addr = window_addr + mem_info.address();
1609 self.configure_barmem(mem_info, bar_addr);
1610 }
1611 }
1612 Ok(ranges)
1613 }
1614
1615 /// Return the supported iova max address of the Vfio Pci device
1616 pub fn get_max_iova(&self) -> u64 {
1617 self.device.get_max_addr()
1618 }
1619
1620 #[cfg(feature = "direct")]
1621 fn coordinated_pm(sysfs_path: &Path, enter: bool) -> anyhow::Result<()> {
1622 let path = sysfs_path.join("power/coordinated");
1623 fs::write(&path, if enter { "enter\n" } else { "exit\n" })
1624 .with_context(|| format!("Failed to write to {}", path.to_string_lossy()))
1625 }
1626
1627 #[cfg(feature = "direct")]
1628 fn coordinated_pm_i2c_adap(
1629 adap_path: &Path,
1630 i2c_devs: &mut HashMap<u16, PathBuf>,
1631 ) -> anyhow::Result<()> {
1632 for entry in iter_dir_starts_with(adap_path, "i2c-")? {
1633 let path = adap_path.join(entry.file_name());
1634
1635 VfioPciDevice::coordinated_pm(&path, true)?;
1636
1637 let addr_path = path.join("address");
1638 let addr = fs::read_to_string(&addr_path).with_context(|| {
1639 format!(
1640 "Failed to read to string from {}",
1641 addr_path.to_string_lossy()
1642 )
1643 })?;
1644 let addr = addr.trim_end().parse::<u16>().with_context(|| {
1645 format!(
1646 "Failed to parse {} from {}",
1647 addr,
1648 addr_path.to_string_lossy()
1649 )
1650 })?;
1651
1652 if let Some(c) = i2c_devs.insert(addr, path.to_path_buf()) {
1653 anyhow::bail!(
1654 "Collision encountered: {}, {}",
1655 path.to_string_lossy(),
1656 c.to_string_lossy()
1657 );
1658 }
1659 }
1660 Ok(())
1661 }
1662
1663 #[cfg(feature = "direct")]
1664 fn coordinated_pm_i2c_platdev(
1665 plat_path: &Path,
1666 i2c_devs: &mut HashMap<u16, PathBuf>,
1667 ) -> anyhow::Result<()> {
1668 for entry in iter_dir_starts_with(plat_path, "i2c-")? {
1669 let path = plat_path.join(entry.file_name());
1670 VfioPciDevice::coordinated_pm_i2c_adap(&path, i2c_devs)?;
1671 }
1672 Ok(())
1673 }
1674
1675 #[cfg(feature = "direct")]
1676 fn coordinated_pm_i2c(
1677 sysfs_path: &Path,
1678 i2c_devs: &mut HashMap<u16, PathBuf>,
1679 ) -> anyhow::Result<()> {
1680 for entry in iter_dir_starts_with(sysfs_path, "i2c_designware")? {
1681 let path = sysfs_path.join(entry.file_name());
1682 VfioPciDevice::coordinated_pm_i2c_platdev(&path, i2c_devs)?;
1683 }
1684 Ok(())
1685 }
1686
1687 #[cfg(feature = "direct")]
1688 fn power_state(&self) -> anyhow::Result<u8> {
1689 let path = self.sysfs_path.join("power_state");
1690 let state = fs::read_to_string(&path)
1691 .with_context(|| format!("Failed to read from {}", path.to_string_lossy()))?;
1692 match state.as_str() {
1693 "D0\n" => Ok(0),
1694 "D1\n" => Ok(1),
1695 "D2\n" => Ok(2),
1696 "D3hot\n" => Ok(3),
1697 "D3cold\n" => Ok(4),
1698 "unknown\n" => Ok(5),
1699 _ => Err(std::io::Error::new(
1700 std::io::ErrorKind::InvalidData,
1701 "invalid state",
1702 ))?,
1703 }
1704 }
1705
1706 #[cfg(feature = "direct")]
1707 fn op_call(path: &Path, id: u8) -> anyhow::Result<()> {
1708 let path = path.join("power/op_call");
1709 fs::write(&path, &[id])
1710 .with_context(|| format!("Failed to write to {}", path.to_string_lossy()))
1711 }
1712
1713 fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
1714 self.ext_caps
1715 .iter()
1716 .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
1717 .cloned()
1718 }
1719
1720 fn is_skipped_reg(&self, reg: u32) -> bool {
1721 // Fast path for the standard PCI config space.
1722 if reg < PCI_CONFIG_SPACE_SIZE {
1723 return false;
1724 }
1725
1726 self.get_ext_cap_by_reg(reg)
1727 .map_or(false, |cap| cap.is_skipped)
1728 }
1729 }
1730
1731 impl PciDevice for VfioPciDevice {
1732 fn debug_label(&self) -> String {
1733 format!("vfio {} device", self.device.device_name())
1734 }
1735
1736 fn preferred_address(&self) -> Option<PciAddress> {
1737 Some(self.preferred_address)
1738 }
1739
1740 fn allocate_address(
1741 &mut self,
1742 resources: &mut SystemAllocator,
1743 ) -> Result<PciAddress, PciDeviceError> {
1744 if self.pci_address.is_none() {
1745 let mut address = self.preferred_address;
1746 while address.func < 8 {
1747 if resources.reserve_pci(
1748 Alloc::PciBar {
1749 bus: address.bus,
1750 dev: address.dev,
1751 func: address.func,
1752 bar: 0,
1753 },
1754 self.debug_label(),
1755 ) {
1756 self.pci_address = Some(address);
1757 break;
1758 } else if self.hotplug_bus_number.is_none() {
1759 break;
1760 } else {
1761 address.func += 1;
1762 }
1763 }
1764 }
1765 self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1766 }
1767
1768 fn keep_rds(&self) -> Vec<RawDescriptor> {
1769 let mut rds = self.device.keep_rds();
1770 if let Some(ref interrupt_evt) = self.interrupt_evt {
1771 rds.extend(interrupt_evt.as_raw_descriptors());
1772 }
1773 rds.push(self.vm_socket_mem.as_raw_descriptor());
1774 if let Some(vm_socket_vm) = &self.vm_socket_vm {
1775 rds.push(vm_socket_vm.as_raw_descriptor());
1776 }
1777 if let Some(msi_cap) = &self.msi_cap {
1778 rds.push(msi_cap.config.get_msi_socket());
1779 }
1780 if let Some(msix_cap) = &self.msix_cap {
1781 rds.push(msix_cap.lock().config.as_raw_descriptor());
1782 }
1783 rds
1784 }
1785
1786 fn preferred_irq(&self) -> PreferredIrq {
1787 // Is INTx configured?
1788 let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
1789 1 => PciInterruptPin::IntA,
1790 2 => PciInterruptPin::IntB,
1791 3 => PciInterruptPin::IntC,
1792 4 => PciInterruptPin::IntD,
1793 _ => return PreferredIrq::None,
1794 };
1795
1796 // TODO: replace sysfs/irq value parsing with vfio interface
1797 // reporting host allocated interrupt number and type.
1798 let path = self.sysfs_path.join("irq");
1799 let gsi = fs::read_to_string(path)
1800 .map(|v| v.trim().parse::<u32>().unwrap_or(0))
1801 .unwrap_or(0);
1802
1803 PreferredIrq::Fixed { pin, gsi }
1804 }
1805
assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32)1806 fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
1807 // Keep event/resample event references.
1808 self.interrupt_evt = Some(irq_evt);
1809
1810 // enable INTX
1811 self.enable_intx();
1812
1813 self.config
1814 .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
1815 self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
1816 }
1817
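    // Memory BARs for devices on the root bus are allocated straight from the MMIO pools;
    // devices behind a (hotplug) bridge go through the non-root path, which is expected to
    // keep their BARs inside the bridge window.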
    fn allocate_io_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_device_bars");

        let mut mem_bars = self.collect_bars();

        let ranges = if address.bus == 0 {
            self.allocate_root_barmem(&mem_bars, resources)?
        } else {
            self.allocate_nonroot_barmem(&mut mem_bars, resources)?
        };

        // Quirk: enable IGD memory access so the guest VGA arbiter driver claims this VGA
        // device; otherwise Xorg cannot start.
        if self.is_intel_gfx() {
            let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
            cmd |= PCI_COMMAND_MEMORY;
            self.config.write_config(cmd, PCI_COMMAND);
        }
        Ok(ranges)
    }

    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let mut ranges: Vec<BarRange> = Vec::new();

        if !self.is_intel_gfx() {
            return Ok(ranges);
        }

        // Expose the Intel graphics OpRegion as an MMIO BAR: allocate a GPA for it, then
        // write that GPA into the PCI config register.
        if let Some((index, size)) = self.device.get_cap_type_info(
            VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
            VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
        ) {
            let address = self
                .pci_address
                .expect("allocate_address must be called prior to allocate_device_bars");
            let bar_addr = resources
                .allocate_mmio(
                    size,
                    Alloc::PciBar {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                        bar: (index * 4) as u8,
                    },
                    "vfio_bar".to_string(),
                    AllocOptions::new().max_address(u32::MAX.into()),
                )
                .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
            ranges.push(BarRange {
                addr: bar_addr,
                size,
                prefetchable: false,
            });
            self.device_data = Some(DeviceData::IntelGfxData {
                opregion_index: index,
            });

            self.mmio_regions.push(
                PciBarConfiguration::new(
                    index as usize,
                    size,
                    PciBarRegionType::Memory32BitRegion,
                    PciBarPrefetchable::NotPrefetchable,
                )
                .set_address(bar_addr),
            );
            self.config.write_config(bar_addr as u32, 0xFC);
        }

        Ok(ranges)
    }

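    // A memory BAR is only reported while PCI_COMMAND_MEMORY is set in the command
    // register; IO BARs are never exposed to the caller.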
    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
            if region.bar_index() == bar_num {
                let command: u8 = self.config.read_config(PCI_COMMAND);
                if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
                    return None;
                } else {
                    return Some(*region);
                }
            }
        }

        None
    }

    fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
        Ok(())
    }

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        #[cfg(feature = "direct")]
        if reg_idx == HEADER_TYPE_REG {
            if let Some(header_type_reg) = self.header_type_reg {
                let mut v = header_type_reg.to_le_bytes();
                // HACK
                // Reads from the "BIST" register are interpreted as device
                // PCI power state
                v[3] = self.power_state().unwrap_or_else(|e| {
                    error!("Failed to get device power state: {}", e);
                    5 // unknown state
                });
                return u32::from_le_bytes(v);
            }
        }

        let reg: u32 = (reg_idx * 4) as u32;
        let mut config: u32 = self.config.read_config(reg);

        // See VfioPciDevice::new for details on how extended caps are managed.
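        // Each extended capability header keeps its "next capability" offset in bits
        // [31:20]; offsets are DWORD-aligned, so the low two bits are always zero. Patch
        // that field so the guest's capability walk skips hidden capabilities, and replace
        // the ID of a skipped capability at the very start of extended config space with
        // PCI_EXT_CAP_ID_CAC as a placeholder (other skipped caps simply read as zero).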
        if reg >= PCI_CONFIG_SPACE_SIZE {
            let ext_cap = self.get_ext_cap_by_reg(reg);
            if let Some(ext_cap) = ext_cap {
                if ext_cap.offset == reg {
                    config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
                }

                if ext_cap.is_skipped {
                    if reg == PCI_CONFIG_SPACE_SIZE {
                        config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
                    } else {
                        config = 0;
                    }
                }
            }
        }

        // Ignore IO bar
        if (0x10..=0x24).contains(&reg) {
            let bar_idx = (reg as usize - 0x10) / 4;
            if let Some(bar) = self.get_bar_configuration(bar_idx) {
                if bar.is_io() {
                    config = 0;
                }
            }
        } else if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(reg, 4) {
                msix_cap.read_msix_control(&mut config);
            }
        } else if let Some(pm_cap) = &self.pm_cap {
            let pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(reg) {
                config = pm_cap.read(reg);
            }
        }

        // Quirk for Intel graphics: set the stolen memory size to 0 in pci_cfg[0x51].
        if self.is_intel_gfx() && reg == 0x50 {
            config &= 0xffff00ff;
        }

        config
    }

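    // Config writes are intercepted in stages: PM capability writes are mirrored into the
    // emulated PmConfig, MSI/MSI-X control writes toggle the corresponding interrupts, the
    // write is then forwarded to the device (unless the register is hidden), and finally
    // BAR and command-register writes are tracked so new mappings can be committed.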
    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        // Start the worker thread the first time the guest writes a config register.
        if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
            self.start_work_thread();
        };

        #[cfg(feature = "direct")]
        if self.supports_coordinated_pm
            && reg_idx == CLASS_REG
            && offset == CLASS_REG_REVISION_ID_OFFSET as u64
            && data.len() == 1
        {
            // HACK
            // Byte writes to the "Revision ID" register are interpreted as PM
            // op calls
            if let Err(e) = VfioPciDevice::op_call(&self.sysfs_path, data[0]) {
                error!("Failed to perform op call: {}", e);
            }
            return;
        }

        let start = (reg_idx * 4) as u64 + offset;

        if let Some(pm_cap) = self.pm_cap.as_mut() {
            let mut pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(start as u32) {
                pm_cap.write(start, data);
            }
        }

        let mut msi_change: Option<VfioMsiChange> = None;
        if let Some(msi_cap) = self.msi_cap.as_mut() {
            if msi_cap.is_msi_reg(start, data.len()) {
                msi_change = msi_cap.write_msi_reg(start, data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msi(),
            Some(VfioMsiChange::Disable) => self.disable_msi(),
            _ => (),
        }

        msi_change = None;
        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
                msi_change = msix_cap.write_msix_control(data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msix(),
            Some(VfioMsiChange::Disable) => self.disable_msix(),
            Some(VfioMsiChange::FunctionChanged) => {
                if let Err(e) = self.msix_vectors_update() {
                    error!("update msix vectors failed: {}", e);
                }
            }
            _ => (),
        }

        if !self.is_skipped_reg(start as u32) {
            self.device
                .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, start);
        }

        // Once the guest enables memory access, map the BARs.
        if start == PCI_COMMAND as u64
            && data.len() == 2
            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
        {
            self.commit_bars_mmap();
        } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
            let bar_idx = (start as u32 - 0x10) / 4;
            let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
            let val = u32::from_le_bytes(value);
            let mut modify = false;
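            // BARs occupy config offsets 0x10..0x24, one 32-bit register per index. A
            // 64-bit memory BAR spans two consecutive registers, so a write to the odd
            // register of the pair updates the high half of the tracked address.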
            for region in self.mmio_regions.iter_mut() {
                if region.bar_index() == bar_idx as usize {
                    let old_addr = region.address();
                    let new_addr = val & 0xFFFFFFF0;
                    if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change 32bit bar address
                        *region = region.set_address(u64::from(new_addr));
                        modify = true;
                    } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change 64bit bar low address
                        *region =
                            region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
                        modify = true;
                    }
                    break;
                } else if region.is_64bit_memory()
                    && ((bar_idx % 2) == 1)
                    && (region.bar_index() + 1 == bar_idx as usize)
                {
                    // Change 64bit bar high address
                    let old_addr = region.address();
                    if val != (old_addr >> 32) as u32 {
                        let mut new_addr = (u64::from(val)) << 32;
                        new_addr |= old_addr & 0xFFFFFFFF;
                        *region = region.set_address(new_addr);
                        modify = true;
                    }
                    break;
                }
            }
            if modify {
                // If a BAR changes while memory access is enabled, mmap the new BAR
                // immediately.
                let cmd = self.config.read_config::<u8>(PCI_COMMAND);
                if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
                    self.commit_bars_mmap();
                }
            }
        }
    }

    fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
        warn!(
            "{} read unsupported register {}",
            self.debug_label(),
            reg_idx
        );
        0
    }

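    // Virtual config register 0 drives runtime PM for the passthrough device: writing 0
    // requests low-power entry (with a wakeup event if one was registered), any other
    // value requests low-power exit. This register is presumably what the ACPI power
    // resource methods emitted in generate_acpi_methods poke through the vcfg region.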
    fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
        match reg_idx {
            0 => {
                match value {
                    0 => {
                        if let Some(pm_evt) =
                            self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
                        {
                            let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
                        } else {
                            let _ = self.device.pm_low_power_enter();
                        }
                    }
                    _ => {
                        let _ = self.device.pm_low_power_exit();
                    }
                };
            }
            _ => warn!(
                "{} write unsupported register {}",
                self.debug_label(),
                reg_idx
            ),
        };
    }

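    // MSI-X table and PBA accesses are served from the emulated MsixConfig; everything
    // else is forwarded to the corresponding VFIO device region.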
    fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
        if let Some(mmio_info) = self.find_region(addr) {
            let offset = addr - mmio_info.address();
            let bar_index = mmio_info.bar_index() as u32;
            if let Some(msix_cap) = &self.msix_cap {
                let msix_cap = msix_cap.lock();
                if msix_cap.is_msix_table(bar_index, offset) {
                    msix_cap.read_table(offset, data);
                    return;
                } else if msix_cap.is_msix_pba(bar_index, offset) {
                    msix_cap.read_pba(offset, data);
                    return;
                }
            }
            self.device.region_read(bar_index, data, offset);
        }
    }

    fn write_bar(&mut self, addr: u64, data: &[u8]) {
        if let Some(mmio_info) = self.find_region(addr) {
            // Ignore writes to the IGD OpRegion.
            if let Some(device_data) = &self.device_data {
                match *device_data {
                    DeviceData::IntelGfxData { opregion_index } => {
                        if opregion_index == mmio_info.bar_index() as u32 {
                            return;
                        }
                    }
                }
            }

            let offset = addr - mmio_info.address();
            let bar_index = mmio_info.bar_index() as u32;

            if let Some(msix_cap) = &self.msix_cap {
                let mut msix_cap = msix_cap.lock();
                if msix_cap.is_msix_table(bar_index, offset) {
                    let behavior = msix_cap.write_table(offset, data);
                    if let MsixStatus::EntryChanged(index) = behavior {
                        let irqfd = msix_cap.get_msix_irqfd(index);
                        self.msix_vector_update(index, irqfd);
                    }
                    return;
                } else if msix_cap.is_msix_pba(bar_index, offset) {
                    msix_cap.write_pba(offset, data);
                    return;
                }
            }

            #[cfg(feature = "direct")]
            if self.is_intel_lpss
                && bar_index == 0
                && offset >= LPSS_MANATEE_OFFSET
                && offset < LPSS_MANATEE_OFFSET + LPSS_MANATEE_SIZE
            {
                if offset != LPSS_MANATEE_OFFSET {
                    warn!(
                        "{} write_bar invalid offset 0x{:x}",
                        self.debug_label(),
                        offset,
                    );
                    return;
                }

                let val = if let Ok(bytes) = data.try_into() {
                    u64::from_le_bytes(bytes)
                } else {
                    warn!(
                        "{} write_bar invalid len 0x{:x}",
                        self.debug_label(),
                        data.len()
                    );
                    return;
                };
                let addr = val as u16;
                let id = (val >> 32) as u8;

                match self.i2c_devs.get(&addr) {
                    Some(path) => {
                        if let Err(e) = VfioPciDevice::op_call(path, id) {
                            error!("{} Failed to perform op call: {}", self.debug_label(), e);
                        }
                    }
                    None => {
                        warn!(
                            "{} write_bar addr 0x{:x} id 0x{:x} not found",
                            self.debug_label(),
                            addr,
                            id
                        );
                    }
                }
                return;
            }
            self.device.region_write(bar_index, data, offset);
        }
    }

    fn destroy_device(&mut self) {
        self.close();
    }

    fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
        let mut amls = Vec::new();
        let mut shm = None;
        if let Some(pci_address) = self.pci_address {
            let vcfg_offset = pci_address.to_config_address(0, 13);
            if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
                vcfg_register.to_aml_bytes(&mut amls);
                shm = vcfg_register
                    .create_shm_mmap()
                    .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
                self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
                // All vfio-pci devices should have a virtual _PRx method; otherwise the
                // host cannot tell whether the device has entered a suspend state, always
                // considers it active, and its parent PCIe switch can never suspend.
                PowerResourceMethod {}.to_aml_bytes(&mut amls);
            }
        }

        (amls, shm)
    }
}

impl Suspendable for VfioPciDevice {
    fn sleep(&mut self) -> anyhow::Result<()> {
        #[cfg(feature = "direct")]
        if self.supports_coordinated_pm {
            for (_, i2c_path) in self.i2c_devs.iter() {
                let _ = VfioPciDevice::coordinated_pm(i2c_path, false);
            }
            let _ = VfioPciDevice::coordinated_pm(&self.sysfs_path, false);
        }

        if let Some(worker_thread) = self.worker_thread.take() {
            let res = worker_thread.stop();
            self.pci_address = Some(res.address);
            self.sysfs_path = res.sysfs_path;
            self.pm_cap = res.pm_cap;
            self.msix_cap = res.msix_cap;
            self.vm_socket_vm = Some(res.vm_socket);
        }
        Ok(())
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            self.start_work_thread();
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use resources::AddressRange;

    use super::VfioResourceAllocator;

    #[test]
    fn no_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
            .unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
    }

    #[test]
    fn complete_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_one() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_two() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
    }

    #[test]
    fn partial_overlap_three() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 39], [48, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
            .unwrap();
        // regions [32, 39], [48, 63], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
            .unwrap();
        // regions [32, 35], [76, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
    }
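
    // Illustrative extra case: an allocation strictly inside a free region splits it in
    // two. This mirrors the intermediate state documented in complete_overlap above.
    #[test]
    fn split_in_middle() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 47)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }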
}