1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::cmp::max;
6 use std::cmp::Reverse;
7 use std::collections::BTreeMap;
8 use std::collections::BTreeSet;
9 use std::fs;
10 use std::path::Path;
11 use std::path::PathBuf;
12 use std::str::FromStr;
13 use std::sync::Arc;
14
15 use acpi_tables::aml::Aml;
16 use base::debug;
17 use base::error;
18 use base::pagesize;
19 use base::warn;
20 use base::AsRawDescriptor;
21 use base::AsRawDescriptors;
22 use base::Event;
23 use base::EventToken;
24 use base::MemoryMapping;
25 use base::Protection;
26 use base::RawDescriptor;
27 use base::Tube;
28 use base::WaitContext;
29 use base::WorkerThread;
30 use hypervisor::MemCacheType;
31 use resources::AddressRange;
32 use resources::Alloc;
33 use resources::AllocOptions;
34 use resources::MmioType;
35 use resources::SystemAllocator;
36 use sync::Mutex;
37 use vfio_sys::vfio::VFIO_PCI_ACPI_NTFY_IRQ_INDEX;
38 use vfio_sys::*;
39 use vm_control::api::VmMemoryClient;
40 use vm_control::HotPlugDeviceInfo;
41 use vm_control::HotPlugDeviceType;
42 use vm_control::VmMemoryDestination;
43 use vm_control::VmMemoryRegionId;
44 use vm_control::VmMemorySource;
45 use vm_control::VmRequest;
46 use vm_control::VmResponse;
47
48 use crate::pci::acpi::DeviceVcfgRegister;
49 use crate::pci::acpi::DsmMethod;
50 use crate::pci::acpi::PowerResourceMethod;
51 use crate::pci::acpi::SHM_OFFSET;
52 use crate::pci::msi::MsiConfig;
53 use crate::pci::msi::MsiStatus;
54 use crate::pci::msi::PCI_MSI_FLAGS;
55 use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
56 use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
57 use crate::pci::msi::PCI_MSI_NEXT_POINTER;
58 use crate::pci::msix::MsixConfig;
59 use crate::pci::msix::MsixStatus;
60 use crate::pci::msix::BITS_PER_PBA_ENTRY;
61 use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
62 use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
63 use crate::pci::pci_device::BarRange;
64 use crate::pci::pci_device::Error as PciDeviceError;
65 use crate::pci::pci_device::PciDevice;
66 use crate::pci::pci_device::PreferredIrq;
67 use crate::pci::pm::PciPmCap;
68 use crate::pci::pm::PmConfig;
69 use crate::pci::pm::PM_CAP_LENGTH;
70 use crate::pci::PciAddress;
71 use crate::pci::PciBarConfiguration;
72 use crate::pci::PciBarIndex;
73 use crate::pci::PciBarPrefetchable;
74 use crate::pci::PciBarRegionType;
75 use crate::pci::PciCapabilityID;
76 use crate::pci::PciClassCode;
77 use crate::pci::PciId;
78 use crate::pci::PciInterruptPin;
79 use crate::pci::PCI_VCFG_DSM;
80 use crate::pci::PCI_VCFG_NOTY;
81 use crate::pci::PCI_VCFG_PM;
82 use crate::pci::PCI_VENDOR_ID_INTEL;
83 use crate::vfio::VfioDevice;
84 use crate::vfio::VfioError;
85 use crate::vfio::VfioIrqType;
86 use crate::vfio::VfioPciConfig;
87 use crate::IrqLevelEvent;
88 use crate::Suspendable;
89
90 const PCI_VENDOR_ID: u32 = 0x0;
91 const PCI_DEVICE_ID: u32 = 0x2;
92 const PCI_COMMAND: u32 = 0x4;
93 const PCI_COMMAND_MEMORY: u8 = 0x2;
94 const PCI_BASE_CLASS_CODE: u32 = 0x0B;
95 const PCI_INTERRUPT_NUM: u32 = 0x3C;
96 const PCI_INTERRUPT_PIN: u32 = 0x3D;
97
98 const PCI_CAPABILITY_LIST: u32 = 0x34;
99 const PCI_CAP_ID_MSI: u8 = 0x05;
100 const PCI_CAP_ID_MSIX: u8 = 0x11;
101 const PCI_CAP_ID_PM: u8 = 0x01;
102
103 // Size of the standard PCI config space
104 const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
105 // Size of the standard PCIe config space: 4KB
106 const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;
107
108 // Extended Capabilities
109 const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
110 const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
111 const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
112 const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;
113
114 struct VfioPmCap {
115 offset: u32,
116 capabilities: u32,
117 config: PmConfig,
118 }
119
120 impl VfioPmCap {
121 fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
122 let mut capabilities: u32 = config.read_config(cap_start);
123 capabilities |= (PciPmCap::default_cap() as u32) << 16;
124 VfioPmCap {
125 offset: cap_start,
126 capabilities,
127 config: PmConfig::new(false),
128 }
129 }
130
131 pub fn should_trigger_pme(&mut self) -> bool {
132 self.config.should_trigger_pme()
133 }
134
135 fn is_pm_reg(&self, offset: u32) -> bool {
136 (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
137 }
138
139 pub fn read(&self, offset: u32) -> u32 {
140 let offset = offset - self.offset;
141 if offset == 0 {
142 self.capabilities
143 } else {
144 let mut data = 0;
145 self.config.read(&mut data);
146 data
147 }
148 }
149
150 pub fn write(&mut self, offset: u64, data: &[u8]) {
151 let offset = offset - self.offset as u64;
152 if offset >= std::mem::size_of::<u32>() as u64 {
153 let offset = offset - std::mem::size_of::<u32>() as u64;
154 self.config.write(offset, data);
155 }
156 }
157 }
158
159 enum VfioMsiChange {
160 Disable,
161 Enable,
162 FunctionChanged,
163 }
164
165 struct VfioMsiCap {
166 config: MsiConfig,
167 offset: u32,
168 }
169
170 impl VfioMsiCap {
171 fn new(
172 config: &VfioPciConfig,
173 msi_cap_start: u32,
174 vm_socket_irq: Tube,
175 device_id: u32,
176 device_name: String,
177 ) -> Self {
178 let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
179 let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
180 let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;
181
182 VfioMsiCap {
183 config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
184 offset: msi_cap_start,
185 }
186 }
187
188 fn is_msi_reg(&self, index: u64, len: usize) -> bool {
189 self.config.is_msi_reg(self.offset, index, len)
190 }
191
192 fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
193 let offset = index as u32 - self.offset;
194 match self.config.write_msi_capability(offset, data) {
195 MsiStatus::Enabled => Some(VfioMsiChange::Enable),
196 MsiStatus::Disabled => Some(VfioMsiChange::Disable),
197 MsiStatus::NothingToDo => None,
198 }
199 }
200
201 fn get_msi_irqfd(&self) -> Option<&Event> {
202 self.config.get_irqfd()
203 }
204
205 fn destroy(&mut self) {
206 self.config.destroy()
207 }
208 }
209
210 // MSI-X registers in MSI-X capability
211 const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
212 const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
213 const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
214 const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
215 const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
216 const PCI_MSIX_PBA: u32 = 0x08; // Pending bit Array offset
217 const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
218 const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
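// Example decode (illustrative value, not from a real device): a PCI_MSIX_TABLE
// register value of 0x0000_2003 places the MSI-X table in BAR 3 at offset 0x2000;
// the same BIR/offset split applies to PCI_MSIX_PBA.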
219
220 struct VfioMsixCap {
221 config: MsixConfig,
222 offset: u32,
223 table_size: u16,
224 table_pci_bar: PciBarIndex,
225 table_offset: u64,
226 table_size_bytes: u64,
227 pba_pci_bar: PciBarIndex,
228 pba_offset: u64,
229 pba_size_bytes: u64,
230 msix_interrupt_evt: Vec<Event>,
231 }
232
233 impl VfioMsixCap {
234 fn new(
235 config: &VfioPciConfig,
236 msix_cap_start: u32,
237 vm_socket_irq: Tube,
238 pci_id: u32,
239 device_name: String,
240 ) -> Self {
241 let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
242 let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
243 let table_pci_bar = (table & PCI_MSIX_TABLE_BIR) as PciBarIndex;
244 let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
245 let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
246 let pba_pci_bar = (pba & PCI_MSIX_PBA_BIR) as PciBarIndex;
247 let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;
248
249 let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
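// Some devices advertise a table size whose span would run into the PBA located
// in the same BAR; clamp the table so it ends where the PBA begins.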
250 if table_pci_bar == pba_pci_bar
251 && pba_offset > table_offset
252 && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
253 {
254 table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
255 }
256
257 let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
258 let pba_size_bytes =
259 table_size.div_ceil(BITS_PER_PBA_ENTRY as u64) * MSIX_PBA_ENTRIES_MODULO;
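// Worked example (assuming the usual 16-byte table entries and 8-byte, 64-bit
// PBA entries defined in msix.rs): a 5-entry table occupies 5 * 16 = 80 bytes
// and needs a single 8-byte PBA entry, since ceil(5 / 64) = 1.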
260 let mut msix_interrupt_evt = Vec::new();
261 for _ in 0..table_size {
262 msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
263 }
264 VfioMsixCap {
265 config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
266 offset: msix_cap_start,
267 table_size: table_size as u16,
268 table_pci_bar,
269 table_offset,
270 table_size_bytes,
271 pba_pci_bar,
272 pba_offset,
273 pba_size_bytes,
274 msix_interrupt_evt,
275 }
276 }
277
278 // Only the MSI-X control register is writable and needs special handling in PCI config read/write.
279 fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
280 let control_start = self.offset + PCI_MSIX_FLAGS;
281 let control_end = control_start + 2;
282
283 offset < control_end && offset + size > control_start
284 }
285
286 fn read_msix_control(&self, data: &mut u32) {
287 *data = self.config.read_msix_capability(*data);
288 }
289
290 fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
291 let old_enabled = self.config.enabled();
292 let old_masked = self.config.masked();
293
294 self.config
295 .write_msix_capability(PCI_MSIX_FLAGS.into(), data);
296
297 let new_enabled = self.config.enabled();
298 let new_masked = self.config.masked();
299
300 if !old_enabled && new_enabled {
301 Some(VfioMsiChange::Enable)
302 } else if old_enabled && !new_enabled {
303 Some(VfioMsiChange::Disable)
304 } else if new_enabled && old_masked != new_masked {
305 Some(VfioMsiChange::FunctionChanged)
306 } else {
307 None
308 }
309 }
310
311 fn is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool {
312 bar_index == self.table_pci_bar
313 && offset >= self.table_offset
314 && offset < self.table_offset + self.table_size_bytes
315 }
316
317 fn get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
318 if bar_index == self.table_pci_bar {
319 AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
320 } else {
321 None
322 }
323 }
324
325 fn read_table(&self, offset: u64, data: &mut [u8]) {
326 let offset = offset - self.table_offset;
327 self.config.read_msix_table(offset, data);
328 }
329
330 fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
331 let offset = offset - self.table_offset;
332 self.config.write_msix_table(offset, data)
333 }
334
335 fn is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool {
336 bar_index == self.pba_pci_bar
337 && offset >= self.pba_offset
338 && offset < self.pba_offset + self.pba_size_bytes
339 }
340
341 fn get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
342 if bar_index == self.pba_pci_bar {
343 AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
344 } else {
345 None
346 }
347 }
348
349 fn read_pba(&self, offset: u64, data: &mut [u8]) {
350 let offset = offset - self.pba_offset;
351 self.config.read_pba_entries(offset, data);
352 }
353
354 fn write_pba(&mut self, offset: u64, data: &[u8]) {
355 let offset = offset - self.pba_offset;
356 self.config.write_pba_entries(offset, data);
357 }
358
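// If the vector is currently masked, hand VFIO a device-local event instead of
// the guest irqfd, so the interrupt is picked up by the worker thread (see
// Token::MsixIrqi in VfioPciWorker::run) and routed through the emulated MSI-X
// config rather than injected directly.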
359 fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
360 let irqfd = self.config.get_irqfd(index);
361 if let Some(fd) = irqfd {
362 if self.msix_vector_masked(index) {
363 Some(&self.msix_interrupt_evt[index])
364 } else {
365 Some(fd)
366 }
367 } else {
368 None
369 }
370 }
371
372 fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
373 let mut irqfds = Vec::new();
374
375 for i in 0..self.table_size {
376 irqfds.push(self.get_msix_irqfd(i as usize));
377 }
378
379 irqfds
380 }
381
382 fn table_size(&self) -> usize {
383 self.table_size.into()
384 }
385
386 fn clone_msix_evt(&self) -> Vec<Event> {
387 self.msix_interrupt_evt
388 .iter()
389 .map(|irq| irq.try_clone().unwrap())
390 .collect()
391 }
392
393 fn msix_vector_masked(&self, index: usize) -> bool {
394 !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
395 }
396
397 fn trigger(&mut self, index: usize) {
398 self.config.trigger(index as u16);
399 }
400
401 fn destroy(&mut self) {
402 self.config.destroy()
403 }
404 }
405
406 struct VfioResourceAllocator {
407 // The region that is not allocated yet.
408 regions: BTreeSet<AddressRange>,
409 }
410
411 impl VfioResourceAllocator {
412 // Creates a new `VfioResourceAllocator` for managing VFIO resources.
413 // Returns `Err` if `pool` is empty.
414 //
415 // * `pool` - The address range (start and end inclusive) to manage.
416 //   Allocations are carved out of this pool.
417 fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
418 if pool.is_empty() {
419 return Err(PciDeviceError::SizeZero);
420 }
421 let mut regions = BTreeSet::new();
422 regions.insert(pool);
423 Ok(VfioResourceAllocator { regions })
424 }
425
426 fn internal_allocate_from_slot(
427 &mut self,
428 slot: AddressRange,
429 range: AddressRange,
430 ) -> Result<u64, PciDeviceError> {
431 let slot_was_present = self.regions.remove(&slot);
432 assert!(slot_was_present);
433
434 let (before, after) = slot.non_overlapping_ranges(range);
435
436 if !before.is_empty() {
437 self.regions.insert(before);
438 }
439 if !after.is_empty() {
440 self.regions.insert(after);
441 }
442
443 Ok(range.start)
444 }
445
446 // Allocates a range of addresses from the managed region with the given minimum alignment.
447 // Overlapping with a previous allocation is _not_ allowed.
448 // Returns the allocated address.
449 fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
450 if size == 0 {
451 return Err(PciDeviceError::SizeZero);
452 }
453 if !alignment.is_power_of_two() {
454 return Err(PciDeviceError::BadAlignment);
455 }
456
457 // Find the first free region that can fit `size` bytes at the requested alignment.
458 let region = self.regions.iter().find(|range| {
459 match range.start % alignment {
460 0 => range.start.checked_add(size - 1),
461 r => range.start.checked_add(size - 1 + alignment - r),
462 }
463 .is_some_and(|end| end <= range.end)
464 });
465
466 match region {
467 Some(&slot) => {
468 let start = match slot.start % alignment {
469 0 => slot.start,
470 r => slot.start + alignment - r,
471 };
472 let end = start + size - 1;
473 let range = AddressRange::from_start_and_end(start, end);
474
475 self.internal_allocate_from_slot(slot, range)
476 }
477 None => Err(PciDeviceError::OutOfSpace),
478 }
479 }
480
481 // Allocates a range of addresses from the managed region with a required location.
482 // Overlapping with a previous allocation is allowed.
483 fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
484 if range.is_empty() {
485 return Err(PciDeviceError::SizeZero);
486 }
487
488 while let Some(&slot) = self
489 .regions
490 .iter()
491 .find(|avail_range| avail_range.overlaps(range))
492 {
493 let _address = self.internal_allocate_from_slot(slot, range)?;
494 }
495 Ok(())
496 }
497 }
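// A hedged sketch (not part of the original source) of how `VfioResourceAllocator`
// behaves: aligned allocations walk the free regions, and `allocate_at_can_overlap`
// punches holes that later allocations must avoid. The offsets below are
// illustrative only.
#[cfg(test)]
mod vfio_resource_allocator_sketch {
    use super::*;

    #[test]
    fn align_and_punch_hole() {
        // Manage a 64 KiB pool starting at 0.
        let mut a =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(0, 0xffff)).unwrap();
        // The first 4 KiB-aligned allocation comes from the start of the pool.
        assert_eq!(a.allocate_with_align(0x1000, 0x1000).unwrap(), 0);
        // Reserve a specific range (e.g. a region that must stay trapped).
        a.allocate_at_can_overlap(AddressRange::from_start_and_end(0x2000, 0x2fff))
            .unwrap();
        // An 8 KiB allocation aligned to 8 KiB no longer fits before the hole,
        // so it lands at 0x4000.
        assert_eq!(a.allocate_with_align(0x2000, 0x2000).unwrap(), 0x4000);
    }
}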
498
499 struct VfioPciWorker {
500 address: PciAddress,
501 sysfs_path: PathBuf,
502 vm_socket: Tube,
503 name: String,
504 pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
505 msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
506 }
507
508 impl VfioPciWorker {
509 fn run(
510 &mut self,
511 req_irq_evt: Event,
512 wakeup_evt: Event,
513 acpi_notify_evt: Event,
514 kill_evt: Event,
515 msix_evt: Vec<Event>,
516 is_in_low_power: Arc<Mutex<bool>>,
517 gpe: Option<u32>,
518 notification_val: Arc<Mutex<Vec<u32>>>,
519 ) {
520 #[derive(EventToken, Debug)]
521 enum Token {
522 ReqIrq,
523 WakeUp,
524 AcpiNotifyEvent,
525 Kill,
526 MsixIrqi { index: usize },
527 }
528
529 let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
530 (&req_irq_evt, Token::ReqIrq),
531 (&wakeup_evt, Token::WakeUp),
532 (&acpi_notify_evt, Token::AcpiNotifyEvent),
533 (&kill_evt, Token::Kill),
534 ]) {
535 Ok(pc) => pc,
536 Err(e) => {
537 error!(
538 "{} failed creating vfio WaitContext: {}",
539 self.name.clone(),
540 e
541 );
542 return;
543 }
544 };
545
546 for (index, msix_int) in msix_evt.iter().enumerate() {
547 wait_ctx
548 .add(msix_int, Token::MsixIrqi { index })
549 .expect("Failed to create vfio WaitContext for msix interrupt event")
550 }
551
552 'wait: loop {
553 let events = match wait_ctx.wait() {
554 Ok(v) => v,
555 Err(e) => {
556 error!("{} failed polling vfio events: {}", self.name.clone(), e);
557 break;
558 }
559 };
560
561 for event in events.iter().filter(|e| e.is_readable) {
562 match event.token {
563 Token::MsixIrqi { index } => {
564 if let Some(msix_cap) = &self.msix_cap {
565 msix_cap.lock().trigger(index);
566 }
567 }
568 Token::ReqIrq => {
569 let device = HotPlugDeviceInfo {
570 device_type: HotPlugDeviceType::EndPoint,
571 path: self.sysfs_path.clone(),
572 hp_interrupt: false,
573 };
574
575 let request = VmRequest::HotPlugVfioCommand { device, add: false };
576 if self.vm_socket.send(&request).is_ok() {
577 if let Err(e) = self.vm_socket.recv::<VmResponse>() {
578 error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
579 } else {
580 break 'wait;
581 }
582 }
583 }
584 Token::WakeUp => {
585 let _ = wakeup_evt.wait();
586
587 if *is_in_low_power.lock() {
588 if let Some(pm_cap) = &self.pm_cap {
589 if pm_cap.lock().should_trigger_pme() {
590 let request =
591 VmRequest::PciPme(self.address.pme_requester_id());
592 if self.vm_socket.send(&request).is_ok() {
593 if let Err(e) = self.vm_socket.recv::<VmResponse>() {
594 error!(
595 "{} failed to send PME: {}",
596 self.name.clone(),
597 e
598 );
599 }
600 }
601 }
602 }
603 }
604 }
605 Token::AcpiNotifyEvent => {
606 if let Some(gpe) = gpe {
607 if let Ok(val) = base::EventExt::read_count(&acpi_notify_evt) {
608 notification_val.lock().push(val as u32);
609 let request = VmRequest::Gpe {
610 gpe,
611 clear_evt: None,
612 };
613 if self.vm_socket.send(&request).is_ok() {
614 if let Err(e) = self.vm_socket.recv::<VmResponse>() {
615 error!("{} failed to send GPE: {}", self.name.clone(), e);
616 }
617 }
618 } else {
619 error!("{} failed to read acpi_notify_evt", self.name.clone());
620 }
621 }
622 }
623 Token::Kill => break 'wait,
624 }
625 }
626 }
627 }
628 }
629
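// The PCIe extended capability header packs the capability ID in bits [15:0],
// the version in bits [19:16], and the next-capability offset in bits [31:20].
// Illustrative example: a header value of 0x1401_0002 decodes to cap ID 0x0002,
// version 1, next offset 0x140, so get_next_from_extcap_header(0x1401_0002)
// returns 0x140 (the `& 0xffc` mask keeps the offset 4-byte aligned).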
630 fn get_next_from_extcap_header(cap_header: u32) -> u32 {
631 (cap_header >> 20) & 0xffc
632 }
633
634 fn is_skipped_ext_cap(cap_id: u16) -> bool {
635 matches!(
636 cap_id,
637 // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
638 PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
639 )
640 }
641
642 enum DeviceData {
643 IntelGfxData { opregion_index: u32 },
644 }
645
646 /// PCI Express Extended Capabilities information
647 #[derive(Copy, Clone)]
648 struct ExtCap {
649 /// cap offset in Configuration Space
650 offset: u32,
651 /// cap size
652 size: u32,
653 /// next offset; for a non-skipped ext cap this is rewritten to the offset of the next non-skipped cap
654 next: u16,
655 /// whether this capability is hidden from the guest
656 is_skipped: bool,
657 }
658
659 /// Implements a VFIO-backed PCI device that is added to the VM as a PCI device
660 pub struct VfioPciDevice {
661 device: Arc<VfioDevice>,
662 config: VfioPciConfig,
663 hotplug: bool,
664 hotplug_bus_number: Option<u8>,
665 preferred_address: PciAddress,
666 pci_address: Option<PciAddress>,
667 interrupt_evt: Option<IrqLevelEvent>,
668 acpi_notification_evt: Option<Event>,
669 mmio_regions: Vec<PciBarConfiguration>,
670 io_regions: Vec<PciBarConfiguration>,
671 pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
672 msi_cap: Option<VfioMsiCap>,
673 msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
674 irq_type: Option<VfioIrqType>,
675 vm_memory_client: VmMemoryClient,
676 device_data: Option<DeviceData>,
677 pm_evt: Option<Event>,
678 is_in_low_power: Arc<Mutex<bool>>,
679 worker_thread: Option<WorkerThread<VfioPciWorker>>,
680 vm_socket_vm: Option<Tube>,
681 sysfs_path: PathBuf,
682 // PCI Express Extended Capabilities
683 ext_caps: Vec<ExtCap>,
684 vcfg_shm_mmap: Option<MemoryMapping>,
685 mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<VmMemoryRegionId>)>,
686 activated: bool,
687 acpi_notifier_val: Arc<Mutex<Vec<u32>>>,
688 gpe: Option<u32>,
689 base_class_code: PciClassCode,
690 }
691
692 impl VfioPciDevice {
693 /// Constructs a new VFIO PCI device for the given VFIO device
694 pub fn new(
695 sysfs_path: &Path,
696 device: VfioDevice,
697 hotplug: bool,
698 hotplug_bus_number: Option<u8>,
699 guest_address: Option<PciAddress>,
700 vfio_device_socket_msi: Tube,
701 vfio_device_socket_msix: Tube,
702 vm_memory_client: VmMemoryClient,
703 vfio_device_socket_vm: Tube,
704 ) -> Result<Self, PciDeviceError> {
705 let preferred_address = if let Some(bus_num) = hotplug_bus_number {
706 debug!("hotplug bus {}", bus_num);
707 PciAddress {
708 // The caller specifies the PCIe bus number for a hotplug device
709 bus: bus_num,
710 // devfn must be 0, otherwise the PCIe root port can't detect the device
711 dev: 0,
712 func: 0,
713 }
714 } else if let Some(guest_address) = guest_address {
715 debug!("guest PCI address {}", guest_address);
716 guest_address
717 } else {
718 let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
719 PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
720 })?;
721 debug!("parsed device PCI address {}", addr);
722 addr
723 };
724
725 let dev = Arc::new(device);
726 let config = VfioPciConfig::new(Arc::clone(&dev));
727 let mut msi_socket = Some(vfio_device_socket_msi);
728 let mut msix_socket = Some(vfio_device_socket_msix);
729 let mut msi_cap: Option<VfioMsiCap> = None;
730 let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
731 let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;
732
733 let mut is_pcie = false;
734 let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
735 let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
736 let device_id: u16 = config.read_config(PCI_DEVICE_ID);
737 let base_class_code = PciClassCode::try_from(config.read_config::<u8>(PCI_BASE_CLASS_CODE))
738 .unwrap_or(PciClassCode::Other);
739
740 let pci_id = PciId::new(vendor_id, device_id);
741
742 while cap_next != 0 {
743 let cap_id: u8 = config.read_config(cap_next);
744 if cap_id == PCI_CAP_ID_PM {
745 pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
746 } else if cap_id == PCI_CAP_ID_MSI {
747 if let Some(msi_socket) = msi_socket.take() {
748 msi_cap = Some(VfioMsiCap::new(
749 &config,
750 cap_next,
751 msi_socket,
752 pci_id.into(),
753 dev.device_name().to_string(),
754 ));
755 }
756 } else if cap_id == PCI_CAP_ID_MSIX {
757 if let Some(msix_socket) = msix_socket.take() {
758 msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
759 &config,
760 cap_next,
761 msix_socket,
762 pci_id.into(),
763 dev.device_name().to_string(),
764 ))));
765 }
766 } else if cap_id == PciCapabilityID::PciExpress as u8 {
767 is_pcie = true;
768 }
769 let offset = cap_next + PCI_MSI_NEXT_POINTER;
770 cap_next = config.read_config::<u8>(offset).into();
771 }
772
773 let mut ext_caps: Vec<ExtCap> = Vec::new();
774 if is_pcie {
775 let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
776 while ext_cap_next != 0 {
777 let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
778 if ext_cap_config == 0 {
779 break;
780 }
781 ext_caps.push(ExtCap {
782 offset: ext_cap_next,
783 // Calculate the size later
784 size: 0,
785 // Initialized with the real next pointer from the header; adjusted below for skipped caps
786 next: get_next_from_extcap_header(ext_cap_config) as u16,
787 is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
788 });
789 ext_cap_next = get_next_from_extcap_header(ext_cap_config);
790 }
791
792 // Manage extended caps
793 //
794 // Extended capabilities are chained with each pointing to the next, so
795 // we can drop anything other than the head of the chain simply by
796 // modifying the previous next pointer. For the head of the chain, we
797 // can modify the capability ID to something that cannot match a valid
798 // capability. PCI_EXT_CAP_ID_CAC is used for this since it is no longer
799 // supported.
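//
// Illustrative example (offsets invented): with caps at 0x100 (kept), 0x140
// (skipped) and 0x180 (kept), the loop below points 0x100's next at 0x180,
// leaves 0x140 unreferenced, and sizes each cap as the gap up to the next
// higher offset (0x40 for 0x100 and 0x140; 0x180 runs to the end of config
// space).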
800 //
801 // reverse order by offset
802 ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
803 let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
804 let mut non_skipped_next: u16 = 0;
805 for ext_cap in ext_caps.iter_mut() {
806 if !ext_cap.is_skipped {
807 ext_cap.next = non_skipped_next;
808 non_skipped_next = ext_cap.offset as u16;
809 } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
810 ext_cap.next = non_skipped_next;
811 }
812 ext_cap.size = next_offset - ext_cap.offset;
813 next_offset = ext_cap.offset;
814 }
815 // order by offset
816 ext_caps.reverse();
817 }
818
819 let is_intel_gfx =
820 base_class_code == PciClassCode::DisplayController && vendor_id == PCI_VENDOR_ID_INTEL;
821 let device_data = if is_intel_gfx {
822 Some(DeviceData::IntelGfxData {
823 opregion_index: u32::MAX,
824 })
825 } else {
826 None
827 };
828
829 Ok(VfioPciDevice {
830 device: dev,
831 config,
832 hotplug,
833 hotplug_bus_number,
834 preferred_address,
835 pci_address: None,
836 interrupt_evt: None,
837 acpi_notification_evt: None,
838 mmio_regions: Vec::new(),
839 io_regions: Vec::new(),
840 pm_cap,
841 msi_cap,
842 msix_cap,
843 irq_type: None,
844 vm_memory_client,
845 device_data,
846 pm_evt: None,
847 is_in_low_power: Arc::new(Mutex::new(false)),
848 worker_thread: None,
849 vm_socket_vm: Some(vfio_device_socket_vm),
850 sysfs_path: sysfs_path.to_path_buf(),
851 ext_caps,
852 vcfg_shm_mmap: None,
853 mapped_mmio_bars: BTreeMap::new(),
854 activated: false,
855 acpi_notifier_val: Arc::new(Mutex::new(Vec::new())),
856 gpe: None,
857 base_class_code,
858 })
859 }
860
861 /// Gets the pci address of the device, if one has already been allocated.
862 pub fn pci_address(&self) -> Option<PciAddress> {
863 self.pci_address
864 }
865
866 pub fn is_gfx(&self) -> bool {
867 self.base_class_code == PciClassCode::DisplayController
868 }
869
870 fn is_intel_gfx(&self) -> bool {
871 matches!(self.device_data, Some(DeviceData::IntelGfxData { .. }))
872 }
873
874 fn enable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
875 if let Some(ref acpi_notification_evt) = self.acpi_notification_evt {
876 return self
877 .device
878 .acpi_notification_evt_enable(acpi_notification_evt, VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
879 .map_err(|_| PciDeviceError::AcpiNotifySetupFailed);
880 }
881 Err(PciDeviceError::AcpiNotifySetupFailed)
882 }
883
884 #[allow(dead_code)]
885 fn disable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
886 if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
887 return self
888 .device
889 .acpi_notification_disable(VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
890 .map_err(|_| PciDeviceError::AcpiNotifyDeactivationFailed);
891 }
892 Err(PciDeviceError::AcpiNotifyDeactivationFailed)
893 }
894
895 #[allow(dead_code)]
896 fn test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError> {
897 if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
898 return self
899 .device
900 .acpi_notification_test(VFIO_PCI_ACPI_NTFY_IRQ_INDEX, val)
901 .map_err(|_| PciDeviceError::AcpiNotifyTestFailed);
902 }
903 Err(PciDeviceError::AcpiNotifyTestFailed)
904 }
905
906 fn enable_intx(&mut self) {
907 if let Some(ref interrupt_evt) = self.interrupt_evt {
908 if let Err(e) = self.device.irq_enable(
909 &[Some(interrupt_evt.get_trigger())],
910 VFIO_PCI_INTX_IRQ_INDEX,
911 0,
912 ) {
913 error!("{} Intx enable failed: {}", self.debug_label(), e);
914 return;
915 }
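// Mask INTx while wiring up the resample event, then unmask, so the
// level-triggered line is only re-armed through the resample event once the
// guest acknowledges the interrupt.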
916 if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
917 error!("{} Intx mask failed: {}", self.debug_label(), e);
918 self.disable_intx();
919 return;
920 }
921 if let Err(e) = self
922 .device
923 .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
924 {
925 error!("{} resample enable failed: {}", self.debug_label(), e);
926 self.disable_intx();
927 return;
928 }
929 if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
930 error!("{} Intx unmask failed: {}", self.debug_label(), e);
931 self.disable_intx();
932 return;
933 }
934 self.irq_type = Some(VfioIrqType::Intx);
935 }
936 }
937
938 fn disable_intx(&mut self) {
939 if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
940 error!("{} Intx disable failed: {}", self.debug_label(), e);
941 }
942 self.irq_type = None;
943 }
944
945 fn disable_irqs(&mut self) {
946 match self.irq_type {
947 Some(VfioIrqType::Msi) => self.disable_msi(),
948 Some(VfioIrqType::Msix) => self.disable_msix(),
949 _ => (),
950 }
951
952 // disable_msi() or disable_msix() above will enable INTx again,
953 // so disable INTx here as well.
954 if let Some(VfioIrqType::Intx) = self.irq_type {
955 self.disable_intx();
956 }
957 }
958
959 fn enable_msi(&mut self) {
960 self.disable_irqs();
961
962 let irqfd = match &self.msi_cap {
963 Some(cap) => {
964 if let Some(fd) = cap.get_msi_irqfd() {
965 fd
966 } else {
967 self.enable_intx();
968 return;
969 }
970 }
971 None => {
972 self.enable_intx();
973 return;
974 }
975 };
976
977 if let Err(e) = self
978 .device
979 .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
980 {
981 error!("{} failed to enable msi: {}", self.debug_label(), e);
982 self.enable_intx();
983 return;
984 }
985
986 self.irq_type = Some(VfioIrqType::Msi);
987 }
988
989 fn disable_msi(&mut self) {
990 if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
991 error!("{} failed to disable msi: {}", self.debug_label(), e);
992 return;
993 }
994 self.irq_type = None;
995
996 self.enable_intx();
997 }
998
999 fn enable_msix(&mut self) {
1000 if self.msix_cap.is_none() {
1001 return;
1002 }
1003
1004 self.disable_irqs();
1005 let cap = self.msix_cap.as_ref().unwrap().lock();
1006 let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());
1007
1008 let mut failed = false;
1009 if !vector_in_use {
1010 // If there are no msix vectors currently in use, we explicitly assign a new eventfd
1011 // to vector 0, then enable it and immediately disable it, so that vfio will
1012 // activate the physical device. If there are msix vectors already in use, just
1013 // enable them instead.
1014 let fd = Event::new().expect("failed to create event");
1015 let table_size = cap.table_size();
1016 let mut irqfds = vec![None; table_size];
1017 irqfds[0] = Some(&fd);
1018 for fd in irqfds.iter_mut().skip(1) {
1019 *fd = None;
1020 }
1021 if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1022 error!("{} failed to enable msix: {}", self.debug_label(), e);
1023 failed = true;
1024 }
1025 irqfds[0] = None;
1026 if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
1027 error!("{} failed to enable msix: {}", self.debug_label(), e);
1028 failed = true;
1029 }
1030 } else {
1031 let result = self
1032 .device
1033 .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
1034 if let Err(e) = result {
1035 error!("{} failed to enable msix: {}", self.debug_label(), e);
1036 failed = true;
1037 }
1038 }
1039
1040 std::mem::drop(cap);
1041 if failed {
1042 self.enable_intx();
1043 return;
1044 }
1045 self.irq_type = Some(VfioIrqType::Msix);
1046 }
1047
1048 fn disable_msix(&mut self) {
1049 if self.msix_cap.is_none() {
1050 return;
1051 }
1052 if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
1053 error!("{} failed to disable msix: {}", self.debug_label(), e);
1054 return;
1055 }
1056 self.irq_type = None;
1057 self.enable_intx();
1058 }
1059
1060 fn msix_vectors_update(&self) -> Result<(), VfioError> {
1061 if let Some(cap) = &self.msix_cap {
1062 self.device
1063 .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
1064 }
1065 Ok(())
1066 }
1067
1068 fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
1069 if let Err(e) = self
1070 .device
1071 .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
1072 {
1073 error!(
1074 "{} failed to update msix vector {}: {}",
1075 self.debug_label(),
1076 index,
1077 e
1078 );
1079 }
1080 }
1081
1082 fn adjust_bar_mmap(
1083 &self,
1084 bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1085 remove_mmaps: &[AddressRange],
1086 ) -> Vec<vfio_region_sparse_mmap_area> {
1087 let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
1088 let pgmask = (pagesize() as u64) - 1;
1089
1090 for mmap in bar_mmaps.iter() {
1091 let mmap_range = if let Some(mmap_range) =
1092 AddressRange::from_start_and_size(mmap.offset, mmap.size)
1093 {
1094 mmap_range
1095 } else {
1096 continue;
1097 };
1098 let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
1099 Ok(a) => a,
1100 Err(e) => {
1101 error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
1102 mmaps.clear();
1103 return mmaps;
1104 }
1105 };
1106
1107 for &(mut remove_range) in remove_mmaps.iter() {
1108 remove_range = remove_range.intersect(mmap_range);
1109 if !remove_range.is_empty() {
1110 // align offsets to page size
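// (e.g. with 4 KiB pages, a remove range of 0x1004..=0x1ff0 widens to
// 0x1000..=0x1fff)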
1111 let begin = remove_range.start & !pgmask;
1112 let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
1113 let remove_range = AddressRange::from_start_and_end(begin, end);
1114 if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
1115 error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
1116 }
1117 }
1118 }
1119
1120 for mmap in to_mmap.regions {
1121 mmaps.push(vfio_region_sparse_mmap_area {
1122 offset: mmap.start,
1123 size: mmap.end - mmap.start + 1,
1124 });
1125 }
1126 }
1127
1128 mmaps
1129 }
1130
1131 fn remove_bar_mmap_msix(
1132 &self,
1133 bar_index: PciBarIndex,
1134 bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
1135 ) -> Vec<vfio_region_sparse_mmap_area> {
1136 let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
1137 let mut msix_regions = Vec::new();
1138
1139 if let Some(t) = msix_cap.get_msix_table(bar_index) {
1140 msix_regions.push(t);
1141 }
1142 if let Some(p) = msix_cap.get_msix_pba(bar_index) {
1143 msix_regions.push(p);
1144 }
1145
1146 if msix_regions.is_empty() {
1147 return bar_mmaps;
1148 }
1149
1150 self.adjust_bar_mmap(bar_mmaps, &msix_regions)
1151 }
1152
1153 fn add_bar_mmap(&self, index: PciBarIndex, bar_addr: u64) -> Vec<VmMemoryRegionId> {
1154 let mut mmaps_ids: Vec<VmMemoryRegionId> = Vec::new();
1155 if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
1156 // The BAR holding the MSI-X table and PBA must not be mmapped;
1157 // those regions are trapped so that MSI-X can be emulated.
1158 let mut mmaps = self.device.get_region_mmap(index);
1159
1160 if self.msix_cap.is_some() {
1161 mmaps = self.remove_bar_mmap_msix(index, mmaps);
1162 }
1163 if mmaps.is_empty() {
1164 return mmaps_ids;
1165 }
1166
1167 for mmap in mmaps.iter() {
1168 let mmap_offset = mmap.offset;
1169 let mmap_size = mmap.size;
1170 let guest_map_start = bar_addr + mmap_offset;
1171 let region_offset = self.device.get_region_offset(index);
1172 let offset = region_offset + mmap_offset;
1173 let descriptor = match self.device.device_file().try_clone() {
1174 Ok(device_file) => device_file.into(),
1175 Err(_) => break,
1176 };
1177 match self.vm_memory_client.register_memory(
1178 VmMemorySource::Descriptor {
1179 descriptor,
1180 offset,
1181 size: mmap_size,
1182 },
1183 VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
1184 Protection::read_write(),
1185 MemCacheType::CacheCoherent,
1186 ) {
1187 Ok(id) => {
1188 mmaps_ids.push(id);
1189 }
1190 Err(e) => {
1191 error!("register_memory failed: {}", e);
1192 break;
1193 }
1194 }
1195 }
1196 }
1197
1198 mmaps_ids
1199 }
1200
1201 fn remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId]) {
1202 for mmap_id in mmap_ids {
1203 if let Err(e) = self.vm_memory_client.unregister_memory(*mmap_id) {
1204 error!("unregister_memory failed: {}", e);
1205 }
1206 }
1207 }
1208
1209 fn disable_bars_mmap(&mut self) {
1210 for (_, (_, mmap_ids)) in self.mapped_mmio_bars.iter() {
1211 self.remove_bar_mmap(mmap_ids);
1212 }
1213 self.mapped_mmio_bars.clear();
1214 }
1215
1216 fn commit_bars_mmap(&mut self) {
1217 // Unmap all bars before remapping bars, to prevent issues with overlap
1218 let mut needs_map = Vec::new();
1219 for mmio_info in self.mmio_regions.iter() {
1220 let bar_idx = mmio_info.bar_index();
1221 let addr = mmio_info.address();
1222
1223 if let Some((cur_addr, ids)) = self.mapped_mmio_bars.remove(&bar_idx) {
1224 if cur_addr == addr {
1225 self.mapped_mmio_bars.insert(bar_idx, (cur_addr, ids));
1226 continue;
1227 } else {
1228 self.remove_bar_mmap(&ids);
1229 }
1230 }
1231
1232 if addr != 0 {
1233 needs_map.push((bar_idx, addr));
1234 }
1235 }
1236
1237 for (bar_idx, addr) in needs_map.iter() {
1238 let ids = self.add_bar_mmap(*bar_idx, *addr);
1239 self.mapped_mmio_bars.insert(*bar_idx, (*addr, ids));
1240 }
1241 }
1242
1243 fn close(&mut self) {
1244 if let Some(msi) = self.msi_cap.as_mut() {
1245 msi.destroy();
1246 }
1247 if let Some(msix) = &self.msix_cap {
1248 msix.lock().destroy();
1249 }
1250 self.disable_bars_mmap();
1251 self.device.close();
1252 }
1253
1254 fn start_work_thread(&mut self) {
1255 let vm_socket = match self.vm_socket_vm.take() {
1256 Some(socket) => socket,
1257 None => return,
1258 };
1259
1260 let req_evt = match Event::new() {
1261 Ok(evt) => {
1262 if let Err(e) = self
1263 .device
1264 .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
1265 {
1266 error!("{} enable req_irq failed: {}", self.debug_label(), e);
1267 return;
1268 }
1269 evt
1270 }
1271 Err(_) => return,
1272 };
1273
1274 let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
1275 Ok(v) => v,
1276 Err(e) => {
1277 error!(
1278 "{} failed creating PM Event pair: {}",
1279 self.debug_label(),
1280 e
1281 );
1282 return;
1283 }
1284 };
1285 self.pm_evt = Some(self_pm_evt);
1286
1287 let (self_acpi_notify_evt, acpi_notify_evt) =
1288 match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
1289 Ok(v) => v,
1290 Err(e) => {
1291 error!(
1292 "{} failed creating ACPI Event pair: {}",
1293 self.debug_label(),
1294 e
1295 );
1296 return;
1297 }
1298 };
1299 self.acpi_notification_evt = Some(self_acpi_notify_evt);
1300
1301 if let Err(e) = self.enable_acpi_notification() {
1302 error!("{}: {}", self.debug_label(), e);
1303 }
1304
1305 let mut msix_evt = Vec::new();
1306 if let Some(msix_cap) = &self.msix_cap {
1307 msix_evt = msix_cap.lock().clone_msix_evt();
1308 }
1309
1310 let name = self.device.device_name().to_string();
1311 let address = self.pci_address.expect("Unassigned PCI Address.");
1312 let sysfs_path = self.sysfs_path.clone();
1313 let pm_cap = self.pm_cap.clone();
1314 let msix_cap = self.msix_cap.clone();
1315 let is_in_low_power = self.is_in_low_power.clone();
1316 let gpe_nr = self.gpe;
1317 let notification_val = self.acpi_notifier_val.clone();
1318 self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
1319 let mut worker = VfioPciWorker {
1320 address,
1321 sysfs_path,
1322 vm_socket,
1323 name,
1324 pm_cap,
1325 msix_cap,
1326 };
1327 worker.run(
1328 req_evt,
1329 pm_evt,
1330 acpi_notify_evt,
1331 kill_evt,
1332 msix_evt,
1333 is_in_low_power,
1334 gpe_nr,
1335 notification_val,
1336 );
1337 worker
1338 }));
1339 self.activated = true;
1340 }
1341
1342 fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
1343 let mut i = VFIO_PCI_BAR0_REGION_INDEX;
1344 let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();
1345
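// Probe each BAR with the standard size-discovery handshake: write all 1s to
// the BAR register, read it back, mask off the flag bits, and take the two's
// complement of the result. For example, reading back 0xfff0_0000 after
// writing 0xffff_ffff indicates a 1 MiB BAR (!0xfff0_0000 + 1 == 0x10_0000).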
1346 while i <= VFIO_PCI_ROM_REGION_INDEX {
1347 let mut low: u32 = 0xffffffff;
1348 let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
1349 0x30
1350 } else {
1351 0x10 + i * 4
1352 };
1353 self.config.write_config(low, offset);
1354 low = self.config.read_config(offset);
1355
1356 let low_flag = low & 0xf;
1357 let is_64bit = low_flag & 0x4 == 0x4;
1358 if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
1359 let mut upper: u32 = 0xffffffff;
1360 if is_64bit {
1361 self.config.write_config(upper, offset + 4);
1362 upper = self.config.read_config(offset + 4);
1363 }
1364
1365 low &= 0xffff_fff0;
1366 let mut size: u64 = u64::from(upper);
1367 size <<= 32;
1368 size |= u64::from(low);
1369 size = !size + 1;
1370 let region_type = if is_64bit {
1371 PciBarRegionType::Memory64BitRegion
1372 } else {
1373 PciBarRegionType::Memory32BitRegion
1374 };
1375 let prefetch = if low_flag & 0x8 == 0x8 {
1376 PciBarPrefetchable::Prefetchable
1377 } else {
1378 PciBarPrefetchable::NotPrefetchable
1379 };
1380 mem_bars.push(PciBarConfiguration::new(
1381 i as usize,
1382 size,
1383 region_type,
1384 prefetch,
1385 ));
1386 } else if low_flag & 0x1 == 0x1 {
1387 let size = !(low & 0xffff_fffc) + 1;
1388 self.io_regions.push(PciBarConfiguration::new(
1389 i as usize,
1390 size.into(),
1391 PciBarRegionType::IoRegion,
1392 PciBarPrefetchable::NotPrefetchable,
1393 ));
1394 }
1395
1396 if is_64bit {
1397 i += 2;
1398 } else {
1399 i += 1;
1400 }
1401 }
1402 mem_bars
1403 }
1404
1405 fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
1406 let offset: u32 = bar_info.reg_index() as u32 * 4;
1407 let mmio_region = *bar_info;
1408 self.mmio_regions.push(mmio_region.set_address(bar_addr));
1409
1410 let val: u32 = self.config.read_config(offset);
1411 let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
1412 self.config.write_config(low, offset);
1413 if bar_info.is_64bit_memory() {
1414 let upper = (bar_addr >> 32) as u32;
1415 self.config.write_config(upper, offset + 4);
1416 }
1417 }
1418
1419 fn allocate_root_barmem(
1420 &mut self,
1421 mem_bars: &[PciBarConfiguration],
1422 resources: &mut SystemAllocator,
1423 ) -> Result<Vec<BarRange>, PciDeviceError> {
1424 let address = self.pci_address.unwrap();
1425 let mut ranges: Vec<BarRange> = Vec::new();
1426 for mem_bar in mem_bars {
1427 let bar_size = mem_bar.size();
1428 let mut bar_addr: u64 = 0;
1429 // Don't allocate MMIO for a hotplug device; the guest OS will allocate it from
1430 // its parent bridge's window.
1431 if !self.hotplug {
1432 bar_addr = resources
1433 .allocate_mmio(
1434 bar_size,
1435 Alloc::PciBar {
1436 bus: address.bus,
1437 dev: address.dev,
1438 func: address.func,
1439 bar: mem_bar.bar_index() as u8,
1440 },
1441 "vfio_bar".to_string(),
1442 AllocOptions::new()
1443 .prefetchable(mem_bar.is_prefetchable())
1444 .max_address(if mem_bar.is_64bit_memory() {
1445 u64::MAX
1446 } else {
1447 u32::MAX.into()
1448 })
1449 .align(bar_size),
1450 )
1451 .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
1452 ranges.push(BarRange {
1453 addr: bar_addr,
1454 size: bar_size,
1455 prefetchable: mem_bar.is_prefetchable(),
1456 });
1457 }
1458 self.configure_barmem(mem_bar, bar_addr);
1459 }
1460 Ok(ranges)
1461 }
1462
1463 fn allocate_nonroot_barmem(
1464 &mut self,
1465 mem_bars: &mut [PciBarConfiguration],
1466 resources: &mut SystemAllocator,
1467 ) -> Result<Vec<BarRange>, PciDeviceError> {
1468 const NON_PREFETCHABLE: usize = 0;
1469 const PREFETCHABLE: usize = 1;
1470 const ARRAY_SIZE: usize = 2;
1471 let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1472 let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1473 match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1474 Ok(a) => a,
1475 Err(e) => {
1476 error!(
1477 "{} init nonroot VfioResourceAllocator failed: {}",
1478 self.debug_label(),
1479 e
1480 );
1481 return Err(e);
1482 }
1483 },
1484 match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1485 Ok(a) => a,
1486 Err(e) => {
1487 error!(
1488 "{} init nonroot VfioResourceAllocator failed: {}",
1489 self.debug_label(),
1490 e
1491 );
1492 return Err(e);
1493 }
1494 },
1495 ];
1496 let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1497 // the window must be 1M-aligned as per the PCI spec
1498 let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
1499 let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];
1500
1501 // Sort by descending BAR size; this can reduce the total allocated size across all BARs.
1502 mem_bars.sort_by_key(|a| Reverse(a.size()));
1503 for mem_bar in mem_bars {
1504 let prefetchable = mem_bar.is_prefetchable();
1505 let is_64bit = mem_bar.is_64bit_memory();
1506
1507 // if one prefetchable bar is 32bit, all the prefetchable bars should be in Low MMIO,
1508 // as all the prefetchable bars should be in one region
1509 if prefetchable && !is_64bit {
1510 memtype[PREFETCHABLE] = MmioType::Low;
1511 }
1512 let i = if prefetchable {
1513 PREFETCHABLE
1514 } else {
1515 NON_PREFETCHABLE
1516 };
1517 let bar_size = mem_bar.size();
1518 let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1519 Ok(s) => s,
1520 Err(e) => {
1521 error!(
1522 "{} nonroot allocate_with_align failed: {}",
1523 self.debug_label(),
1524 e
1525 );
1526 return Err(e);
1527 }
1528 };
1529 window_sz[i] = max(window_sz[i], start + bar_size);
1530 alignment[i] = max(alignment[i], bar_size);
1531 let mem_info = (*mem_bar).set_address(start);
1532 membars[i].push(mem_info);
1533 }
1534
1535 let address = self.pci_address.unwrap();
1536 let mut ranges: Vec<BarRange> = Vec::new();
1537 for (index, bars) in membars.iter().enumerate() {
1538 if bars.is_empty() {
1539 continue;
1540 }
1541
1542 let i = if index == 1 {
1543 PREFETCHABLE
1544 } else {
1545 NON_PREFETCHABLE
1546 };
1547 let mut window_addr: u64 = 0;
1548 // Don't allocate MMIO for a hotplug device; the guest OS will allocate it from
1549 // its parent bridge's window.
1550 if !self.hotplug {
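// Round the bridge window up to the next 1 MiB boundary as required for
// PCI bridge windows (e.g. 0x18_0000 rounds up to 0x20_0000).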
1551 window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
1552 let alloc = if i == NON_PREFETCHABLE {
1553 Alloc::PciBridgeWindow {
1554 bus: address.bus,
1555 dev: address.dev,
1556 func: address.func,
1557 }
1558 } else {
1559 Alloc::PciBridgePrefetchWindow {
1560 bus: address.bus,
1561 dev: address.dev,
1562 func: address.func,
1563 }
1564 };
1565 window_addr = resources
1566 .mmio_allocator(memtype[i])
1567 .allocate_with_align(
1568 window_sz[i],
1569 alloc,
1570 "vfio_bar_window".to_string(),
1571 alignment[i],
1572 )
1573 .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
1574 for mem_info in bars {
1575 let bar_addr = window_addr + mem_info.address();
1576 ranges.push(BarRange {
1577 addr: bar_addr,
1578 size: mem_info.size(),
1579 prefetchable: mem_info.is_prefetchable(),
1580 });
1581 }
1582 }
1583
1584 for mem_info in bars {
1585 let bar_addr = window_addr + mem_info.address();
1586 self.configure_barmem(mem_info, bar_addr);
1587 }
1588 }
1589 Ok(ranges)
1590 }
1591
1592 /// Returns the maximum IOVA address supported by the VFIO PCI device
1593 pub fn get_max_iova(&self) -> u64 {
1594 self.device.get_max_addr()
1595 }
1596
1597 fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
1598 self.ext_caps
1599 .iter()
1600 .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
1601 .cloned()
1602 }
1603
1604 fn is_skipped_reg(&self, reg: u32) -> bool {
1605 // Fast path for the standard PCI config space
1606 if reg < PCI_CONFIG_SPACE_SIZE {
1607 return false;
1608 }
1609
1610 self.get_ext_cap_by_reg(reg)
1611 .is_some_and(|cap| cap.is_skipped)
1612 }
1613 }
1614
1615 impl PciDevice for VfioPciDevice {
1616 fn debug_label(&self) -> String {
1617 format!("vfio {} device", self.device.device_name())
1618 }
1619
1620 fn preferred_address(&self) -> Option<PciAddress> {
1621 Some(self.preferred_address)
1622 }
1623
1624 fn allocate_address(
1625 &mut self,
1626 resources: &mut SystemAllocator,
1627 ) -> Result<PciAddress, PciDeviceError> {
1628 if self.pci_address.is_none() {
1629 let mut address = self.preferred_address;
1630 while address.func < 8 {
1631 if resources.reserve_pci(address, self.debug_label()) {
1632 self.pci_address = Some(address);
1633 break;
1634 } else if self.hotplug_bus_number.is_none() {
1635 break;
1636 } else {
1637 address.func += 1;
1638 }
1639 }
1640 }
1641 self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1642 }
1643
1644 fn keep_rds(&self) -> Vec<RawDescriptor> {
1645 let mut rds = self.device.keep_rds();
1646 if let Some(ref interrupt_evt) = self.interrupt_evt {
1647 rds.extend(interrupt_evt.as_raw_descriptors());
1648 }
1649 rds.push(self.vm_memory_client.as_raw_descriptor());
1650 if let Some(vm_socket_vm) = &self.vm_socket_vm {
1651 rds.push(vm_socket_vm.as_raw_descriptor());
1652 }
1653 if let Some(msi_cap) = &self.msi_cap {
1654 rds.push(msi_cap.config.get_msi_socket());
1655 }
1656 if let Some(msix_cap) = &self.msix_cap {
1657 rds.push(msix_cap.lock().config.as_raw_descriptor());
1658 }
1659 rds
1660 }
1661
1662 fn preferred_irq(&self) -> PreferredIrq {
1663 // Is INTx configured?
1664 let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
1665 1 => PciInterruptPin::IntA,
1666 2 => PciInterruptPin::IntB,
1667 3 => PciInterruptPin::IntC,
1668 4 => PciInterruptPin::IntD,
1669 _ => return PreferredIrq::None,
1670 };
1671
1672 // TODO: replace sysfs/irq value parsing with vfio interface
1673 // reporting host allocated interrupt number and type.
1674 let path = self.sysfs_path.join("irq");
1675 let gsi = fs::read_to_string(path)
1676 .map(|v| v.trim().parse::<u32>().unwrap_or(0))
1677 .unwrap_or(0);
1678
1679 PreferredIrq::Fixed { pin, gsi }
1680 }
1681
1682 fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
1683 // Keep event/resample event references.
1684 self.interrupt_evt = Some(irq_evt);
1685
1686 // enable INTX
1687 self.enable_intx();
1688
1689 self.config
1690 .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
1691 self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
1692 }
1693
1694 fn allocate_io_bars(
1695 &mut self,
1696 resources: &mut SystemAllocator,
1697 ) -> Result<Vec<BarRange>, PciDeviceError> {
1698 let address = self
1699 .pci_address
1700 .expect("allocate_address must be called prior to allocate_device_bars");
1701
1702 let mut mem_bars = self.collect_bars();
1703
1704 let ranges = if address.bus == 0 {
1705 self.allocate_root_barmem(&mem_bars, resources)?
1706 } else {
1707 self.allocate_nonroot_barmem(&mut mem_bars, resources)?
1708 };
1709
1710 // Quirk: enable IGD memory for guest VGA arbitration, otherwise the kernel VGA arbiter
1711 // driver doesn't claim this VGA device and Xorg can't start.
1712 if self.is_intel_gfx() {
1713 let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
1714 cmd |= PCI_COMMAND_MEMORY;
1715 self.config.write_config(cmd, PCI_COMMAND);
1716 }
1717 Ok(ranges)
1718 }
1719
1720 fn allocate_device_bars(
1721 &mut self,
1722 resources: &mut SystemAllocator,
1723 ) -> Result<Vec<BarRange>, PciDeviceError> {
1724 let mut ranges: Vec<BarRange> = Vec::new();
1725
1726 if !self.is_intel_gfx() {
1727 return Ok(ranges);
1728 }
1729
1730 // Expose the Intel graphics OpRegion as an MMIO BAR: allocate a GPA for it,
1731 // then write that GPA into the PCI config register.
1732 if let Some((index, size)) = self.device.get_cap_type_info(
1733 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
1734 VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
1735 ) {
1736 let address = self
1737 .pci_address
1738 .expect("allocate_address must be called prior to allocate_device_bars");
1739 let bar_addr = resources
1740 .allocate_mmio(
1741 size,
1742 Alloc::PciBar {
1743 bus: address.bus,
1744 dev: address.dev,
1745 func: address.func,
1746 bar: (index * 4) as u8,
1747 },
1748 "vfio_bar".to_string(),
1749 AllocOptions::new().max_address(u32::MAX.into()),
1750 )
1751 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1752 ranges.push(BarRange {
1753 addr: bar_addr,
1754 size,
1755 prefetchable: false,
1756 });
1757 self.device_data = Some(DeviceData::IntelGfxData {
1758 opregion_index: index,
1759 });
1760
1761 self.mmio_regions.push(
1762 PciBarConfiguration::new(
1763 index as usize,
1764 size,
1765 PciBarRegionType::Memory32BitRegion,
1766 PciBarPrefetchable::NotPrefetchable,
1767 )
1768 .set_address(bar_addr),
1769 );
1770 self.config.write_config(bar_addr as u32, 0xFC);
1771 }
1772
1773 Ok(ranges)
1774 }
1775
1776 fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1777 for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
1778 if region.bar_index() == bar_num {
1779 let command: u8 = self.config.read_config(PCI_COMMAND);
1780 if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
1781 return None;
1782 } else {
1783 return Some(*region);
1784 }
1785 }
1786 }
1787
1788 None
1789 }
1790
register_device_capabilities(&mut self) -> Result<(), PciDeviceError>1791 fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
1792 Ok(())
1793 }
1794
read_config_register(&self, reg_idx: usize) -> u321795 fn read_config_register(&self, reg_idx: usize) -> u32 {
1796 let reg: u32 = (reg_idx * 4) as u32;
1797 let mut config: u32 = self.config.read_config(reg);
1798
        // See VfioPciDevice::new for details on how extended caps are managed.
        if reg >= PCI_CONFIG_SPACE_SIZE {
            let ext_cap = self.get_ext_cap_by_reg(reg);
            if let Some(ext_cap) = ext_cap {
                if ext_cap.offset == reg {
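                    // The next-capability offset lives in bits 31:20 of the PCIe extended
                    // capability header; offsets are DWORD aligned, so only the bits covered
                    // by the 0xffc mask are meaningful. Patch in the offset of the next cap
                    // exposed to the guest.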
                    config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
                }

                if ext_cap.is_skipped {
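                    // A hidden capability normally reads as zero, but the one at 0x100 must
                    // keep a valid header so the guest can still walk the chain: preserve its
                    // next pointer and substitute the CAC capability ID (which the guest is
                    // not expected to match) for its own.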
                    if reg == PCI_CONFIG_SPACE_SIZE {
                        config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
                    } else {
                        config = 0;
                    }
                }
            }
        }

        // Ignore IO bar
        if (0x10..=0x24).contains(&reg) {
            let bar_idx = (reg as usize - 0x10) / 4;
            if let Some(bar) = self.get_bar_configuration(bar_idx) {
                if bar.is_io() {
                    config = 0;
                }
            }
        } else if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(reg, 4) {
                msix_cap.read_msix_control(&mut config);
            }
        } else if let Some(pm_cap) = &self.pm_cap {
            let pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(reg) {
                config = pm_cap.read(reg);
            }
        }

        // Quirk for Intel graphics: report the stolen memory size as 0 in pci_cfg[0x51].
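        // (Masking with 0xffff00ff zeroes byte 0x51 of the dword read at 0x50, which is
        // where the stolen memory size field lives.)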
        if self.is_intel_gfx() && reg == 0x50 {
            config &= 0xffff00ff;
        }

        config
    }

    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        // Start the worker thread the first time the guest writes a config register.
        if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
            self.start_work_thread();
        };

        let start = (reg_idx * 4) as u64 + offset;

        if let Some(pm_cap) = self.pm_cap.as_mut() {
            let mut pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(start as u32) {
                pm_cap.write(start, data);
            }
        }

        let mut msi_change: Option<VfioMsiChange> = None;
        if let Some(msi_cap) = self.msi_cap.as_mut() {
            if msi_cap.is_msi_reg(start, data.len()) {
                msi_change = msi_cap.write_msi_reg(start, data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msi(),
            Some(VfioMsiChange::Disable) => self.disable_msi(),
            _ => (),
        }

        msi_change = None;
        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
                msi_change = msix_cap.write_msix_control(data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msix(),
            Some(VfioMsiChange::Disable) => self.disable_msix(),
            Some(VfioMsiChange::FunctionChanged) => {
                if let Err(e) = self.msix_vectors_update() {
                    error!("update msix vectors failed: {}", e);
                }
            }
            _ => (),
        }

        if !self.is_skipped_reg(start as u32) {
            self.device
                .region_write(VFIO_PCI_CONFIG_REGION_INDEX as usize, data, start);
        }

        // If the guest enables memory access, commit the BAR mappings once.
        if start == PCI_COMMAND as u64
            && data.len() == 2
            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
        {
            self.commit_bars_mmap();
        } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
            let bar_idx = (start as u32 - 0x10) / 4;
            let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
            let val = u32::from_le_bytes(value);
            let mut modify = false;
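            // A 64-bit memory BAR spans two consecutive 32-bit BAR registers: the low
            // dword (whose bits 3:0 carry the region type/prefetchable flags, hence the
            // 0xFFFFFFF0 mask) followed by the high dword. Update whichever half the
            // guest wrote and keep the other half of the stored address.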
            for region in self.mmio_regions.iter_mut() {
                if region.bar_index() == bar_idx as usize {
                    let old_addr = region.address();
                    let new_addr = val & 0xFFFFFFF0;
                    if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change 32bit bar address
                        *region = region.set_address(u64::from(new_addr));
                        modify = true;
                    } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change 64bit bar low address
                        *region =
                            region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
                        modify = true;
                    }
                    break;
                } else if region.is_64bit_memory()
                    && ((bar_idx % 2) == 1)
                    && (region.bar_index() + 1 == bar_idx as usize)
                {
                    // Change 64bit bar high address
                    let old_addr = region.address();
                    if val != (old_addr >> 32) as u32 {
                        let mut new_addr = (u64::from(val)) << 32;
                        new_addr |= old_addr & 0xFFFFFFFF;
                        *region = region.set_address(new_addr);
                        modify = true;
                    }
                    break;
                }
            }
            if modify {
                // If a BAR changes while memory access is enabled, mmap the new BAR
                // immediately.
                let cmd = self.config.read_config::<u8>(PCI_COMMAND);
                if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
                    self.commit_bars_mmap();
                }
            }
        }
    }

    fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
        if reg_idx == PCI_VCFG_NOTY {
            let mut q = self.acpi_notifier_val.lock();
            let mut val = 0;
            if !q.is_empty() {
                val = q.remove(0);
            }
            drop(q);
            return val;
        }

        warn!(
            "{} read unsupported vcfg register {}",
            self.debug_label(),
            reg_idx
        );
        0xFFFF_FFFF
    }

    fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
        match reg_idx {
            PCI_VCFG_PM => {
                match value {
                    0 => {
                        if let Some(pm_evt) =
                            self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
                        {
                            *self.is_in_low_power.lock() = true;
                            let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
                        } else {
                            let _ = self.device.pm_low_power_enter();
                        }
                    }
                    _ => {
                        *self.is_in_low_power.lock() = false;
                        let _ = self.device.pm_low_power_exit();
                    }
                };
            }
            PCI_VCFG_DSM => {
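                // Relay a guest _DSM call to the host: the arguments were staged in the
                // vcfg shared-memory page, the host ACPI _DSM is evaluated, and the result
                // is written back to the same page for the guest to pick up.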
                if let Some(shm) = &self.vcfg_shm_mmap {
                    let mut args = [0u8; 4096];
                    if let Err(e) = shm.read_slice(&mut args, 0) {
                        error!("failed to read DSM Args: {}", e);
                        return;
                    }
                    let res = match self.device.acpi_dsm(&args) {
                        Ok(r) => r,
                        Err(e) => {
                            error!("failed to call DSM: {}", e);
                            return;
                        }
                    };
                    if let Err(e) = shm.write_slice(&res, 0) {
                        error!("failed to write DSM result: {}", e);
                        return;
                    }
                    if let Err(e) = shm.msync() {
                        error!("failed to msync: {}", e)
                    }
                }
            }
            _ => warn!(
                "{} write unsupported vcfg register {}",
                self.debug_label(),
                reg_idx
            ),
        };
    }

    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
        if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_table(bar_index, offset) {
                msix_cap.read_table(offset, data);
                return;
            } else if msix_cap.is_msix_pba(bar_index, offset) {
                msix_cap.read_pba(offset, data);
                return;
            }
        }
        self.device.region_read(bar_index, data, offset);
    }

    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
        // Ignore writes to the IGD OpRegion.
        if let Some(device_data) = &self.device_data {
            match *device_data {
                DeviceData::IntelGfxData { opregion_index } => {
                    if opregion_index == bar_index as u32 {
                        return;
                    }
                }
            }
        }

        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_table(bar_index, offset) {
                let behavior = msix_cap.write_table(offset, data);
                if let MsixStatus::EntryChanged(index) = behavior {
                    let irqfd = msix_cap.get_msix_irqfd(index);
                    self.msix_vector_update(index, irqfd);
                }
                return;
            } else if msix_cap.is_msix_pba(bar_index, offset) {
                msix_cap.write_pba(offset, data);
                return;
            }
        }

        self.device.region_write(bar_index, data, offset);
    }

    fn destroy_device(&mut self) {
        self.close();
    }

    fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
        let mut amls = Vec::new();
        let mut shm = None;
        if let Some(pci_address) = self.pci_address {
            let vcfg_offset = pci_address.to_config_address(0, 13);
            if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
                vcfg_register.to_aml_bytes(&mut amls);
                shm = vcfg_register
                    .create_shm_mmap()
                    .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
                self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
                // All vfio-pci devices should have a virtual _PRx method; otherwise the
                // host can't tell whether the device has entered a suspend state and
                // always considers it active, so its parent PCIe switch can never enter
                // a suspend state either.
                PowerResourceMethod {}.to_aml_bytes(&mut amls);
                // TODO: WIP: Ideally, we should generate _DSM only if the physical
                // device has a _DSM; however, such information is not provided by
                // Linux. As a temporary workaround, we check whether there is an
                // associated ACPI companion device node and skip generating the guest
                // _DSM if there is none.
                let acpi_path = self.sysfs_path.join("firmware_node/path");
                if acpi_path.exists() {
                    DsmMethod {}.to_aml_bytes(&mut amls);
                }
            }
        }

        (amls, shm)
    }

    fn set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32> {
        if let Some(gpe_nr) = resources.allocate_gpe() {
            base::debug!("set_gpe: gpe-nr {} addr {:?}", gpe_nr, self.pci_address);
            self.gpe = Some(gpe_nr);
        }
        self.gpe
    }
}

impl Suspendable for VfioPciDevice {
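    // sleep() stops the worker thread and stashes the state it hands back (address,
    // sysfs path, capability handles, VM socket) so wake() can restart the thread with
    // the same state if the device was already activated.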
    fn sleep(&mut self) -> anyhow::Result<()> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let res = worker_thread.stop();
            self.pci_address = Some(res.address);
            self.sysfs_path = res.sysfs_path;
            self.pm_cap = res.pm_cap;
            self.msix_cap = res.msix_cap;
            self.vm_socket_vm = Some(res.vm_socket);
        }
        Ok(())
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            self.start_work_thread();
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use resources::AddressRange;

    use super::VfioResourceAllocator;

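    // These tests exercise VfioResourceAllocator::allocate_at_can_overlap, which carves
    // the requested range out of the allocator's free regions: any overlapped region is
    // trimmed or split, and parts of the request outside the free regions are ignored.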
    #[test]
    fn no_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
            .unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
    }

    #[test]
    fn complete_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_one() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_two() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
    }

    #[test]
    fn partial_overlap_three() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 39], [48, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
            .unwrap();
        // regions [32, 39], [48, 63], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
            .unwrap();
        // regions [32, 35], [76, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
    }
}