// Copyright 2019 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::cmp::max;
use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::u32;

use acpi_tables::aml::Aml;
use base::debug;
use base::error;
use base::pagesize;
use base::warn;
use base::AsRawDescriptor;
use base::AsRawDescriptors;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::Protection;
use base::RawDescriptor;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::MemCacheType;
use resources::AddressRange;
use resources::Alloc;
use resources::AllocOptions;
use resources::MmioType;
use resources::SystemAllocator;
use sync::Mutex;
use vfio_sys::vfio::VFIO_PCI_ACPI_NTFY_IRQ_INDEX;
use vfio_sys::*;
use vm_control::api::VmMemoryClient;
use vm_control::HotPlugDeviceInfo;
use vm_control::HotPlugDeviceType;
use vm_control::VmMemoryDestination;
use vm_control::VmMemoryRegionId;
use vm_control::VmMemorySource;
use vm_control::VmRequest;
use vm_control::VmResponse;

use crate::pci::acpi::DeviceVcfgRegister;
use crate::pci::acpi::DsmMethod;
use crate::pci::acpi::PowerResourceMethod;
use crate::pci::acpi::SHM_OFFSET;
use crate::pci::msi::MsiConfig;
use crate::pci::msi::MsiStatus;
use crate::pci::msi::PCI_MSI_FLAGS;
use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
use crate::pci::msi::PCI_MSI_NEXT_POINTER;
use crate::pci::msix::MsixConfig;
use crate::pci::msix::MsixStatus;
use crate::pci::msix::BITS_PER_PBA_ENTRY;
use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::Error as PciDeviceError;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::PreferredIrq;
use crate::pci::pm::PciPmCap;
use crate::pci::pm::PmConfig;
use crate::pci::pm::PM_CAP_LENGTH;
use crate::pci::PciAddress;
use crate::pci::PciBarConfiguration;
use crate::pci::PciBarIndex;
use crate::pci::PciBarPrefetchable;
use crate::pci::PciBarRegionType;
use crate::pci::PciCapabilityID;
use crate::pci::PciClassCode;
use crate::pci::PciId;
use crate::pci::PciInterruptPin;
use crate::pci::PCI_VCFG_DSM;
use crate::pci::PCI_VCFG_NOTY;
use crate::pci::PCI_VCFG_PM;
use crate::pci::PCI_VENDOR_ID_INTEL;
use crate::vfio::VfioDevice;
use crate::vfio::VfioError;
use crate::vfio::VfioIrqType;
use crate::vfio::VfioPciConfig;
use crate::IrqLevelEvent;
use crate::Suspendable;

const PCI_VENDOR_ID: u32 = 0x0;
const PCI_DEVICE_ID: u32 = 0x2;
const PCI_COMMAND: u32 = 0x4;
const PCI_COMMAND_MEMORY: u8 = 0x2;
const PCI_BASE_CLASS_CODE: u32 = 0x0B;
const PCI_INTERRUPT_NUM: u32 = 0x3C;
const PCI_INTERRUPT_PIN: u32 = 0x3D;

const PCI_CAPABILITY_LIST: u32 = 0x34;
const PCI_CAP_ID_MSI: u8 = 0x05;
const PCI_CAP_ID_MSIX: u8 = 0x11;
const PCI_CAP_ID_PM: u8 = 0x01;

// Size of the standard PCI config space
const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
// Size of the standard PCIe config space: 4KB
const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;

// Extended Capabilities
const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;

struct VfioPmCap {
    offset: u32,
    capabilities: u32,
    config: PmConfig,
}

impl VfioPmCap {
    fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
        let mut capabilities: u32 = config.read_config(cap_start);
        capabilities |= (PciPmCap::default_cap() as u32) << 16;
        VfioPmCap {
            offset: cap_start,
            capabilities,
            config: PmConfig::new(false),
        }
    }

    pub fn should_trigger_pme(&mut self) -> bool {
        self.config.should_trigger_pme()
    }

    fn is_pm_reg(&self, offset: u32) -> bool {
        (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
    }

    pub fn read(&self, offset: u32) -> u32 {
        let offset = offset - self.offset;
        if offset == 0 {
            self.capabilities
        } else {
            let mut data = 0;
            self.config.read(&mut data);
            data
        }
    }

    pub fn write(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.offset as u64;
        if offset >= std::mem::size_of::<u32>() as u64 {
            let offset = offset - std::mem::size_of::<u32>() as u64;
            self.config.write(offset, data);
        }
    }
}

enum VfioMsiChange {
    Disable,
    Enable,
    FunctionChanged,
}

struct VfioMsiCap {
    config: MsiConfig,
    offset: u32,
}

impl VfioMsiCap {
    fn new(
        config: &VfioPciConfig,
        msi_cap_start: u32,
        vm_socket_irq: Tube,
        device_id: u32,
        device_name: String,
    ) -> Self {
        let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
        let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
        let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;

        VfioMsiCap {
            config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
            offset: msi_cap_start,
        }
    }

    fn is_msi_reg(&self, index: u64, len: usize) -> bool {
        self.config.is_msi_reg(self.offset, index, len)
    }

    fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
        let offset = index as u32 - self.offset;
        match self.config.write_msi_capability(offset, data) {
            MsiStatus::Enabled => Some(VfioMsiChange::Enable),
            MsiStatus::Disabled => Some(VfioMsiChange::Disable),
            MsiStatus::NothingToDo => None,
        }
    }

    fn get_msi_irqfd(&self) -> Option<&Event> {
        self.config.get_irqfd()
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

// MSI-X registers in MSI-X capability
const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
const PCI_MSIX_PBA: u32 = 0x08; // Pending bit Array offset
const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
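
// Example decode (illustrative values): a Table register of 0x0000_2004 gives
// BIR = 0x2004 & PCI_MSIX_TABLE_BIR = 4, i.e. the table lives in BAR 4, and a
// table offset of 0x2004 & PCI_MSIX_TABLE_OFFSET = 0x2000 bytes into that BAR.
// The PBA register decodes the same way with the PBA masks.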

struct VfioMsixCap {
    config: MsixConfig,
    offset: u32,
    table_size: u16,
    table_pci_bar: PciBarIndex,
    table_offset: u64,
    table_size_bytes: u64,
    pba_pci_bar: PciBarIndex,
    pba_offset: u64,
    pba_size_bytes: u64,
    msix_interrupt_evt: Vec<Event>,
}

impl VfioMsixCap {
    fn new(
        config: &VfioPciConfig,
        msix_cap_start: u32,
        vm_socket_irq: Tube,
        pci_id: u32,
        device_name: String,
    ) -> Self {
        let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
        let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
        let table_pci_bar = (table & PCI_MSIX_TABLE_BIR) as PciBarIndex;
        let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
        let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
        let pba_pci_bar = (pba & PCI_MSIX_PBA_BIR) as PciBarIndex;
        let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;

        let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
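        // If the device advertises a table that would overlap the PBA within
        // the same BAR, trust the PBA offset and shrink the table size so the
        // two regions stay disjoint.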
        if table_pci_bar == pba_pci_bar
            && pba_offset > table_offset
            && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
        {
            table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
        }

        let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
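        // The PBA is sized in whole entries of BITS_PER_PBA_ENTRY bits: e.g.
        // a 5-vector table still occupies one full PBA entry.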
        let pba_size_bytes = ((table_size + BITS_PER_PBA_ENTRY as u64 - 1)
            / BITS_PER_PBA_ENTRY as u64)
            * MSIX_PBA_ENTRIES_MODULO;
        let mut msix_interrupt_evt = Vec::new();
        for _ in 0..table_size {
            msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
        }
        VfioMsixCap {
            config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
            offset: msix_cap_start,
            table_size: table_size as u16,
            table_pci_bar,
            table_offset,
            table_size_bytes,
            pba_pci_bar,
            pba_offset,
            pba_size_bytes,
            msix_interrupt_evt,
        }
    }

    // Only the MSI-X control register is writable and needs special handling
    // in PCI config read/write.
    fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
        let control_start = self.offset + PCI_MSIX_FLAGS;
        let control_end = control_start + 2;

        offset < control_end && offset + size > control_start
    }

    fn read_msix_control(&self, data: &mut u32) {
        *data = self.config.read_msix_capability(*data);
    }

    fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
        let old_enabled = self.config.enabled();
        let old_masked = self.config.masked();

        self.config
            .write_msix_capability(PCI_MSIX_FLAGS.into(), data);

        let new_enabled = self.config.enabled();
        let new_masked = self.config.masked();

        if !old_enabled && new_enabled {
            Some(VfioMsiChange::Enable)
        } else if old_enabled && !new_enabled {
            Some(VfioMsiChange::Disable)
        } else if new_enabled && old_masked != new_masked {
            Some(VfioMsiChange::FunctionChanged)
        } else {
            None
        }
    }

    fn is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool {
        bar_index == self.table_pci_bar
            && offset >= self.table_offset
            && offset < self.table_offset + self.table_size_bytes
    }

    fn get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
        if bar_index == self.table_pci_bar {
            AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
        } else {
            None
        }
    }

    fn read_table(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.table_offset;
        self.config.read_msix_table(offset, data);
    }

    fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
        let offset = offset - self.table_offset;
        self.config.write_msix_table(offset, data)
    }

    fn is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool {
        bar_index == self.pba_pci_bar
            && offset >= self.pba_offset
            && offset < self.pba_offset + self.pba_size_bytes
    }

    fn get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
        if bar_index == self.pba_pci_bar {
            AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
        } else {
            None
        }
    }

    fn read_pba(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.pba_offset;
        self.config.read_pba_entries(offset, data);
    }

    fn write_pba(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.pba_offset;
        self.config.write_pba_entries(offset, data);
    }

    fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
        let irqfd = self.config.get_irqfd(index);
        if let Some(fd) = irqfd {
            if self.msix_vector_masked(index) {
                Some(&self.msix_interrupt_evt[index])
            } else {
                Some(fd)
            }
        } else {
            None
        }
    }

    fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
        let mut irqfds = Vec::new();

        for i in 0..self.table_size {
            irqfds.push(self.get_msix_irqfd(i as usize));
        }

        irqfds
    }

    fn table_size(&self) -> usize {
        self.table_size.into()
    }

    fn clone_msix_evt(&self) -> Vec<Event> {
        self.msix_interrupt_evt
            .iter()
            .map(|irq| irq.try_clone().unwrap())
            .collect()
    }

    fn msix_vector_masked(&self, index: usize) -> bool {
        !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
    }

    fn trigger(&mut self, index: usize) {
        self.config.trigger(index as u16);
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

struct VfioResourceAllocator {
    // The regions that are not yet allocated.
    regions: BTreeSet<AddressRange>,
}

impl VfioResourceAllocator {
    // Creates a new `VfioResourceAllocator` for managing VFIO resources.
    // Returns `Err` if `pool` is empty.
    //
    // * `pool` - The address range to manage.
    fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
        if pool.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }
        let mut regions = BTreeSet::new();
        regions.insert(pool);
        Ok(VfioResourceAllocator { regions })
    }

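    // Removes `slot` from the set of free regions and re-inserts the parts of
    // `slot` that do not overlap `range`, i.e. marks `range` as allocated.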
    fn internal_allocate_from_slot(
        &mut self,
        slot: AddressRange,
        range: AddressRange,
    ) -> Result<u64, PciDeviceError> {
        let slot_was_present = self.regions.remove(&slot);
        assert!(slot_was_present);

        let (before, after) = slot.non_overlapping_ranges(range);

        if !before.is_empty() {
            self.regions.insert(before);
        }
        if !after.is_empty() {
            self.regions.insert(after);
        }

        Ok(range.start)
    }

    // Allocates a range of addresses from the managed region with a minimal alignment.
    // Overlapping with a previous allocation is _not_ allowed.
    // Returns allocated address.
    fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
        if size == 0 {
            return Err(PciDeviceError::SizeZero);
        }
        if !alignment.is_power_of_two() {
            return Err(PciDeviceError::BadAlignment);
        }

        // finds first region matching alignment and size.
        let region = self.regions.iter().find(|range| {
            match range.start % alignment {
                0 => range.start.checked_add(size - 1),
                r => range.start.checked_add(size - 1 + alignment - r),
            }
            .map_or(false, |end| end <= range.end)
        });

        match region {
            Some(&slot) => {
                let start = match slot.start % alignment {
                    0 => slot.start,
                    r => slot.start + alignment - r,
                };
                let end = start + size - 1;
                let range = AddressRange::from_start_and_end(start, end);

                self.internal_allocate_from_slot(slot, range)
            }
            None => Err(PciDeviceError::OutOfSpace),
        }
    }

    // Allocates a range of addresses from the managed region with a required location.
    // Overlapping with a previous allocation is allowed.
    fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
        if range.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }

        while let Some(&slot) = self
            .regions
            .iter()
            .find(|avail_range| avail_range.overlaps(range))
        {
            let _address = self.internal_allocate_from_slot(slot, range)?;
        }
        Ok(())
    }
}
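
// Illustrative usage of `VfioResourceAllocator` (a sketch, not from the
// original source; the addresses are made up):
//
//     let pool = AddressRange::from_start_and_end(0x1000, 0x2fff);
//     let mut alloc = VfioResourceAllocator::new(pool)?;
//     // First fit honoring alignment: returns 0x1000.
//     let addr = alloc.allocate_with_align(0x800, 0x800)?;
//     // Punch a hole at a fixed location, e.g. to carve the MSI-X table
//     // pages out of a BAR mmap (see adjust_bar_mmap below).
//     alloc.allocate_at_can_overlap(AddressRange::from_start_and_end(0x2000, 0x2fff))?;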

struct VfioPciWorker {
    address: PciAddress,
    sysfs_path: PathBuf,
    vm_socket: Tube,
    name: String,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
}

impl VfioPciWorker {
    fn run(
        &mut self,
        req_irq_evt: Event,
        wakeup_evt: Event,
        acpi_notify_evt: Event,
        kill_evt: Event,
        msix_evt: Vec<Event>,
        is_in_low_power: Arc<Mutex<bool>>,
        gpe: Option<u32>,
        notification_val: Arc<Mutex<Vec<u32>>>,
    ) {
        #[derive(EventToken, Debug)]
        enum Token {
            ReqIrq,
            WakeUp,
            AcpiNotifyEvent,
            Kill,
            MsixIrqi { index: usize },
        }

        let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
            (&req_irq_evt, Token::ReqIrq),
            (&wakeup_evt, Token::WakeUp),
            (&acpi_notify_evt, Token::AcpiNotifyEvent),
            (&kill_evt, Token::Kill),
        ]) {
            Ok(pc) => pc,
            Err(e) => {
                error!(
                    "{} failed creating vfio WaitContext: {}",
                    self.name.clone(),
                    e
                );
                return;
            }
        };

        for (index, msix_int) in msix_evt.iter().enumerate() {
            wait_ctx
                .add(msix_int, Token::MsixIrqi { index })
                .expect("Failed to create vfio WaitContext for msix interrupt event")
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{} failed polling vfio events: {}", self.name.clone(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::MsixIrqi { index } => {
                        if let Some(msix_cap) = &self.msix_cap {
                            msix_cap.lock().trigger(index);
                        }
                    }
                    Token::ReqIrq => {
                        let device = HotPlugDeviceInfo {
                            device_type: HotPlugDeviceType::EndPoint,
                            path: self.sysfs_path.clone(),
                            hp_interrupt: false,
                        };

                        let request = VmRequest::HotPlugVfioCommand { device, add: false };
                        if self.vm_socket.send(&request).is_ok() {
                            if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
                            } else {
                                break 'wait;
                            }
                        }
                    }
                    Token::WakeUp => {
                        let _ = wakeup_evt.wait();

                        if *is_in_low_power.lock() {
                            if let Some(pm_cap) = &self.pm_cap {
                                if pm_cap.lock().should_trigger_pme() {
                                    let request =
                                        VmRequest::PciPme(self.address.pme_requester_id());
                                    if self.vm_socket.send(&request).is_ok() {
                                        if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                            error!(
                                                "{} failed to send PME: {}",
                                                self.name.clone(),
                                                e
                                            );
                                        }
                                    }
                                }
                            }
                        }
                    }
                    Token::AcpiNotifyEvent => {
                        if let Some(gpe) = gpe {
                            if let Ok(val) = base::EventExt::read_count(&acpi_notify_evt) {
                                notification_val.lock().push(val as u32);
                                let request = VmRequest::Gpe(gpe);
                                if self.vm_socket.send(&request).is_ok() {
                                    if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                        error!("{} failed to send GPE: {}", self.name.clone(), e);
                                    }
                                }
                            } else {
                                error!("{} failed to read acpi_notify_evt", self.name.clone());
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
    }
}

fn get_next_from_extcap_header(cap_header: u32) -> u32 {
    (cap_header >> 20) & 0xffc
}
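
// Worked example (illustrative value): an extended capability header of
// 0x1401_0002 encodes capability ID 0x0002 in its low 16 bits and its next
// pointer in bits [31:20]: (0x1401_0002 >> 20) & 0xffc = 0x140. The & 0xffc
// keeps the pointer DWORD-aligned within the 4KB config space.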

fn is_skipped_ext_cap(cap_id: u16) -> bool {
    matches!(
        cap_id,
        // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
        PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
    )
}

enum DeviceData {
    IntelGfxData { opregion_index: u32 },
}

/// PCI Express Extended Capabilities information
#[derive(Copy, Clone)]
struct ExtCap {
    /// Capability offset in config space
    offset: u32,
    /// Capability size
    size: u32,
    /// Next-capability pointer presented to the guest; for a non-skipped
    /// capability this is the offset of the next non-skipped capability
    next: u16,
    /// Whether this capability is hidden from the guest
    is_skipped: bool,
}

/// Implements a VFIO PCI device, which is then added to the VM as a PCI device
pub struct VfioPciDevice {
    device: Arc<VfioDevice>,
    config: VfioPciConfig,
    hotplug: bool,
    hotplug_bus_number: Option<u8>,
    preferred_address: PciAddress,
    pci_address: Option<PciAddress>,
    interrupt_evt: Option<IrqLevelEvent>,
    acpi_notification_evt: Option<Event>,
    mmio_regions: Vec<PciBarConfiguration>,
    io_regions: Vec<PciBarConfiguration>,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msi_cap: Option<VfioMsiCap>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
    irq_type: Option<VfioIrqType>,
    vm_memory_client: VmMemoryClient,
    device_data: Option<DeviceData>,
    pm_evt: Option<Event>,
    is_in_low_power: Arc<Mutex<bool>>,
    worker_thread: Option<WorkerThread<VfioPciWorker>>,
    vm_socket_vm: Option<Tube>,
    sysfs_path: PathBuf,
    // PCI Express Extended Capabilities
    ext_caps: Vec<ExtCap>,
    vcfg_shm_mmap: Option<MemoryMapping>,
    mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<VmMemoryRegionId>)>,
    activated: bool,
    acpi_notifier_val: Arc<Mutex<Vec<u32>>>,
    gpe: Option<u32>,
    base_class_code: PciClassCode,
}

impl VfioPciDevice {
    /// Constructs a new Vfio Pci device for the given Vfio device
    pub fn new(
        sysfs_path: &Path,
        device: VfioDevice,
        hotplug: bool,
        hotplug_bus_number: Option<u8>,
        guest_address: Option<PciAddress>,
        vfio_device_socket_msi: Tube,
        vfio_device_socket_msix: Tube,
        vm_memory_client: VmMemoryClient,
        vfio_device_socket_vm: Tube,
    ) -> Result<Self, PciDeviceError> {
        let preferred_address = if let Some(bus_num) = hotplug_bus_number {
            debug!("hotplug bus {}", bus_num);
            PciAddress {
                // The caller specifies the PCIe bus number for a hotplug device.
                bus: bus_num,
                // devfn must be 0, otherwise the PCIe root port cannot detect it.
                dev: 0,
                func: 0,
            }
        } else if let Some(guest_address) = guest_address {
            debug!("guest PCI address {}", guest_address);
            guest_address
        } else {
            let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
                PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
            })?;
            debug!("parsed device PCI address {}", addr);
            addr
        };

        let dev = Arc::new(device);
        let config = VfioPciConfig::new(Arc::clone(&dev));
        let mut msi_socket = Some(vfio_device_socket_msi);
        let mut msix_socket = Some(vfio_device_socket_msix);
        let mut msi_cap: Option<VfioMsiCap> = None;
        let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
        let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;

        let mut is_pcie = false;
        let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
        let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
        let device_id: u16 = config.read_config(PCI_DEVICE_ID);
        let base_class_code = PciClassCode::try_from(config.read_config::<u8>(PCI_BASE_CLASS_CODE))
            .unwrap_or(PciClassCode::Other);

        let pci_id = PciId::new(vendor_id, device_id);

        while cap_next != 0 {
            let cap_id: u8 = config.read_config(cap_next);
            if cap_id == PCI_CAP_ID_PM {
                pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
            } else if cap_id == PCI_CAP_ID_MSI {
                if let Some(msi_socket) = msi_socket.take() {
                    msi_cap = Some(VfioMsiCap::new(
                        &config,
                        cap_next,
                        msi_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ));
                }
            } else if cap_id == PCI_CAP_ID_MSIX {
                if let Some(msix_socket) = msix_socket.take() {
                    msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
                        &config,
                        cap_next,
                        msix_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ))));
                }
            } else if cap_id == PciCapabilityID::PciExpress as u8 {
                is_pcie = true;
            }
            let offset = cap_next + PCI_MSI_NEXT_POINTER;
            cap_next = config.read_config::<u8>(offset).into();
        }

771
772 let mut ext_caps: Vec<ExtCap> = Vec::new();
773 if is_pcie {
774 let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
775 while ext_cap_next != 0 {
776 let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
777 if ext_cap_config == 0 {
778 break;
779 }
780 ext_caps.push(ExtCap {
781 offset: ext_cap_next,
782 // Calculate the size later
783 size: 0,
784 // init as the real value
785 next: get_next_from_extcap_header(ext_cap_config) as u16,
786 is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
787 });
788 ext_cap_next = get_next_from_extcap_header(ext_cap_config);
789 }
790
            // Manage extended caps
            //
            // Extended capabilities are chained with each pointing to the
            // next, so we can drop anything other than the head of the chain
            // simply by modifying the previous next pointer. For the head of
            // the chain, we can modify the capability ID to something that
            // cannot match a valid capability. The obsolete ID
            // PCI_EXT_CAP_ID_CAC is used for this, since it is no longer
            // supported.
            //
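            // For example (illustrative offsets): with capabilities at 0x100,
            // 0x148 and 0x200 where the one at 0x148 is skipped, the guest
            // sees the chain 0x100 -> 0x200, and config reads within 0x148's
            // range return 0 (see read_config_register).
            //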
            // reverse order by offset
            ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
            let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
            let mut non_skipped_next: u16 = 0;
            for ext_cap in ext_caps.iter_mut() {
                if !ext_cap.is_skipped {
                    ext_cap.next = non_skipped_next;
                    non_skipped_next = ext_cap.offset as u16;
                } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
                    ext_cap.next = non_skipped_next;
                }
                ext_cap.size = next_offset - ext_cap.offset;
                next_offset = ext_cap.offset;
            }
            // order by offset
            ext_caps.reverse();
        }

        let is_intel_gfx =
            base_class_code == PciClassCode::DisplayController && vendor_id == PCI_VENDOR_ID_INTEL;
        let device_data = if is_intel_gfx {
            Some(DeviceData::IntelGfxData {
                opregion_index: u32::max_value(),
            })
        } else {
            None
        };

        Ok(VfioPciDevice {
            device: dev,
            config,
            hotplug,
            hotplug_bus_number,
            preferred_address,
            pci_address: None,
            interrupt_evt: None,
            acpi_notification_evt: None,
            mmio_regions: Vec::new(),
            io_regions: Vec::new(),
            pm_cap,
            msi_cap,
            msix_cap,
            irq_type: None,
            vm_memory_client,
            device_data,
            pm_evt: None,
            is_in_low_power: Arc::new(Mutex::new(false)),
            worker_thread: None,
            vm_socket_vm: Some(vfio_device_socket_vm),
            sysfs_path: sysfs_path.to_path_buf(),
            ext_caps,
            vcfg_shm_mmap: None,
            mapped_mmio_bars: BTreeMap::new(),
            activated: false,
            acpi_notifier_val: Arc::new(Mutex::new(Vec::new())),
            gpe: None,
            base_class_code,
        })
    }

    /// Gets the pci address of the device, if one has already been allocated.
    pub fn pci_address(&self) -> Option<PciAddress> {
        self.pci_address
    }

    pub fn is_gfx(&self) -> bool {
        self.base_class_code == PciClassCode::DisplayController
    }

    fn is_intel_gfx(&self) -> bool {
        matches!(self.device_data, Some(DeviceData::IntelGfxData { .. }))
    }

    fn enable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
        if let Some(ref acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_evt_enable(acpi_notification_evt, VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
                .map_err(|_| PciDeviceError::AcpiNotifySetupFailed);
        }
        Err(PciDeviceError::AcpiNotifySetupFailed)
    }

    #[allow(dead_code)]
    fn disable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_disable(VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
                .map_err(|_| PciDeviceError::AcpiNotifyDeactivationFailed);
        }
        Err(PciDeviceError::AcpiNotifyDeactivationFailed)
    }

    #[allow(dead_code)]
    fn test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError> {
        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_test(VFIO_PCI_ACPI_NTFY_IRQ_INDEX, val)
                .map_err(|_| PciDeviceError::AcpiNotifyTestFailed);
        }
        Err(PciDeviceError::AcpiNotifyTestFailed)
    }

    fn enable_intx(&mut self) {
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            if let Err(e) = self.device.irq_enable(
                &[Some(interrupt_evt.get_trigger())],
                VFIO_PCI_INTX_IRQ_INDEX,
                0,
            ) {
                error!("{} Intx enable failed: {}", self.debug_label(), e);
                return;
            }
            if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx mask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self
                .device
                .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
            {
                error!("{} resample enable failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx unmask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            self.irq_type = Some(VfioIrqType::Intx);
        }
    }

    fn disable_intx(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
            error!("{} Intx disable failed: {}", self.debug_label(), e);
        }
        self.irq_type = None;
    }

    fn disable_irqs(&mut self) {
        match self.irq_type {
            Some(VfioIrqType::Msi) => self.disable_msi(),
            Some(VfioIrqType::Msix) => self.disable_msix(),
            _ => (),
        }

        // disable_msi() and disable_msix() above re-enable INTx as a fallback,
        // so explicitly disable INTx here as well.
        if let Some(VfioIrqType::Intx) = self.irq_type {
            self.disable_intx();
        }
    }

    fn enable_msi(&mut self) {
        self.disable_irqs();

        let irqfd = match &self.msi_cap {
            Some(cap) => {
                if let Some(fd) = cap.get_msi_irqfd() {
                    fd
                } else {
                    self.enable_intx();
                    return;
                }
            }
            None => {
                self.enable_intx();
                return;
            }
        };

        if let Err(e) = self
            .device
            .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
        {
            error!("{} failed to enable msi: {}", self.debug_label(), e);
            self.enable_intx();
            return;
        }

        self.irq_type = Some(VfioIrqType::Msi);
    }

    fn disable_msi(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
            error!("{} failed to disable msi: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;

        self.enable_intx();
    }

    fn enable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }

        self.disable_irqs();
        let cap = self.msix_cap.as_ref().unwrap().lock();
        let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());

        let mut failed = false;
        if !vector_in_use {
            // If no MSI-X vector is currently in use, explicitly assign a new
            // eventfd to vector 0, enable it, and then immediately disable it,
            // so that VFIO activates the physical device. If vectors are
            // available, just enable them directly instead.
            let fd = Event::new().expect("failed to create event");
            let table_size = cap.table_size();
            let mut irqfds = vec![None; table_size];
            irqfds[0] = Some(&fd);
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
            irqfds[0] = None;
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        } else {
            let result = self
                .device
                .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
            if let Err(e) = result {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        }

        std::mem::drop(cap);
        if failed {
            self.enable_intx();
            return;
        }
        self.irq_type = Some(VfioIrqType::Msix);
    }

    fn disable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
            error!("{} failed to disable msix: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;
        self.enable_intx();
    }

    fn msix_vectors_update(&self) -> Result<(), VfioError> {
        if let Some(cap) = &self.msix_cap {
            self.device
                .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
        }
        Ok(())
    }

    fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
        if let Err(e) = self
            .device
            .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
        {
            error!(
                "{} failed to update msix vector {}: {}",
                self.debug_label(),
                index,
                e
            );
        }
    }

    fn adjust_bar_mmap(
        &self,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
        remove_mmaps: &[AddressRange],
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
        let pgmask = (pagesize() as u64) - 1;

        for mmap in bar_mmaps.iter() {
            let mmap_range = if let Some(mmap_range) =
                AddressRange::from_start_and_size(mmap.offset, mmap.size)
            {
                mmap_range
            } else {
                continue;
            };
            let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
                Ok(a) => a,
                Err(e) => {
                    error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    mmaps.clear();
                    return mmaps;
                }
            };

            for &(mut remove_range) in remove_mmaps.iter() {
                remove_range = remove_range.intersect(mmap_range);
                if !remove_range.is_empty() {
                    // align offsets to page size
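                    // e.g. with 4 KiB pages, a remove range of 0x1100..=0x12ff
                    // widens to 0x1000..=0x1fff: the start rounds down and the
                    // end rounds up to page boundaries, since the mmap hole
                    // must cover whole pages.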
                    let begin = remove_range.start & !pgmask;
                    let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
                    let remove_range = AddressRange::from_start_and_end(begin, end);
                    if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
                        error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    }
                }
            }

            for mmap in to_mmap.regions {
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: mmap.start,
                    size: mmap.end - mmap.start + 1,
                });
            }
        }

        mmaps
    }

    fn remove_bar_mmap_msix(
        &self,
        bar_index: PciBarIndex,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
        let mut msix_regions = Vec::new();

        if let Some(t) = msix_cap.get_msix_table(bar_index) {
            msix_regions.push(t);
        }
        if let Some(p) = msix_cap.get_msix_pba(bar_index) {
            msix_regions.push(p);
        }

        if msix_regions.is_empty() {
            return bar_mmaps;
        }

        self.adjust_bar_mmap(bar_mmaps, &msix_regions)
    }

    fn add_bar_mmap(&self, index: PciBarIndex, bar_addr: u64) -> Vec<VmMemoryRegionId> {
        let mut mmaps_ids: Vec<VmMemoryRegionId> = Vec::new();
        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
            // The parts of a BAR that hold the MSI-X table and PBA must not be
            // mmapped; those ranges stay trapped so MSI-X can be emulated.
            let mut mmaps = self.device.get_region_mmap(index);

            if self.msix_cap.is_some() {
                mmaps = self.remove_bar_mmap_msix(index, mmaps);
            }
            if mmaps.is_empty() {
                return mmaps_ids;
            }

            for mmap in mmaps.iter() {
                let mmap_offset = mmap.offset;
                let mmap_size = mmap.size;
                let guest_map_start = bar_addr + mmap_offset;
                let region_offset = self.device.get_region_offset(index);
                let offset = region_offset + mmap_offset;
                let descriptor = match self.device.device_file().try_clone() {
                    Ok(device_file) => device_file.into(),
                    Err(_) => break,
                };
                match self.vm_memory_client.register_memory(
                    VmMemorySource::Descriptor {
                        descriptor,
                        offset,
                        size: mmap_size,
                    },
                    VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
                    Protection::read_write(),
                    MemCacheType::CacheCoherent,
                ) {
                    Ok(id) => {
                        mmaps_ids.push(id);
                    }
                    Err(e) => {
                        error!("register_memory failed: {}", e);
                        break;
                    }
                }
            }
        }

        mmaps_ids
    }

    fn remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId]) {
        for mmap_id in mmap_ids {
            if let Err(e) = self.vm_memory_client.unregister_memory(*mmap_id) {
                error!("unregister_memory failed: {}", e);
            }
        }
    }

    fn disable_bars_mmap(&mut self) {
        for (_, (_, mmap_ids)) in self.mapped_mmio_bars.iter() {
            self.remove_bar_mmap(mmap_ids);
        }
        self.mapped_mmio_bars.clear();
    }

    fn commit_bars_mmap(&mut self) {
        // Unmap all bars before remapping bars, to prevent issues with overlap
        let mut needs_map = Vec::new();
        for mmio_info in self.mmio_regions.iter() {
            let bar_idx = mmio_info.bar_index();
            let addr = mmio_info.address();

            if let Some((cur_addr, ids)) = self.mapped_mmio_bars.remove(&bar_idx) {
                if cur_addr == addr {
                    self.mapped_mmio_bars.insert(bar_idx, (cur_addr, ids));
                    continue;
                } else {
                    self.remove_bar_mmap(&ids);
                }
            }

            if addr != 0 {
                needs_map.push((bar_idx, addr));
            }
        }

        for (bar_idx, addr) in needs_map.iter() {
            let ids = self.add_bar_mmap(*bar_idx, *addr);
            self.mapped_mmio_bars.insert(*bar_idx, (*addr, ids));
        }
    }

    fn close(&mut self) {
        if let Some(msi) = self.msi_cap.as_mut() {
            msi.destroy();
        }
        if let Some(msix) = &self.msix_cap {
            msix.lock().destroy();
        }
        self.disable_bars_mmap();
        self.device.close();
    }

    fn start_work_thread(&mut self) {
        let vm_socket = match self.vm_socket_vm.take() {
            Some(socket) => socket,
            None => return,
        };

        let req_evt = match Event::new() {
            Ok(evt) => {
                if let Err(e) = self
                    .device
                    .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
                {
                    error!("{} enable req_irq failed: {}", self.debug_label(), e);
                    return;
                }
                evt
            }
            Err(_) => return,
        };

        let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{} failed creating PM Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };
        self.pm_evt = Some(self_pm_evt);

        let (self_acpi_notify_evt, acpi_notify_evt) =
            match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
                Ok(v) => v,
                Err(e) => {
                    error!(
                        "{} failed creating ACPI Event pair: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
        self.acpi_notification_evt = Some(self_acpi_notify_evt);

        if let Err(e) = self.enable_acpi_notification() {
            error!("{}: {}", self.debug_label(), e);
        }

        let mut msix_evt = Vec::new();
        if let Some(msix_cap) = &self.msix_cap {
            msix_evt = msix_cap.lock().clone_msix_evt();
        }

        let name = self.device.device_name().to_string();
        let address = self.pci_address.expect("Unassigned PCI Address.");
        let sysfs_path = self.sysfs_path.clone();
        let pm_cap = self.pm_cap.clone();
        let msix_cap = self.msix_cap.clone();
        let is_in_low_power = self.is_in_low_power.clone();
        let gpe_nr = self.gpe;
        let notification_val = self.acpi_notifier_val.clone();
        self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
            let mut worker = VfioPciWorker {
                address,
                sysfs_path,
                vm_socket,
                name,
                pm_cap,
                msix_cap,
            };
            worker.run(
                req_evt,
                pm_evt,
                acpi_notify_evt,
                kill_evt,
                msix_evt,
                is_in_low_power,
                gpe_nr,
                notification_val,
            );
            worker
        }));
        self.activated = true;
    }

    fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
        let mut i = VFIO_PCI_BAR0_REGION_INDEX;
        let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();

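        // Standard PCI BAR sizing probe: write all 1s to the BAR register,
        // read it back, and mask off the flag bits; the device hard-wires the
        // low (size - 1) address bits to zero, so size = !value + 1. E.g. a
        // masked read-back of 0xfff0_0000 means a 1 MiB 32-bit BAR.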
        while i <= VFIO_PCI_ROM_REGION_INDEX {
            let mut low: u32 = 0xffffffff;
            let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
                0x30
            } else {
                0x10 + i * 4
            };
            self.config.write_config(low, offset);
            low = self.config.read_config(offset);

            let low_flag = low & 0xf;
            let is_64bit = low_flag & 0x4 == 0x4;
            if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
                let mut upper: u32 = 0xffffffff;
                if is_64bit {
                    self.config.write_config(upper, offset + 4);
                    upper = self.config.read_config(offset + 4);
                }

                low &= 0xffff_fff0;
                let mut size: u64 = u64::from(upper);
                size <<= 32;
                size |= u64::from(low);
                size = !size + 1;
                let region_type = if is_64bit {
                    PciBarRegionType::Memory64BitRegion
                } else {
                    PciBarRegionType::Memory32BitRegion
                };
                let prefetch = if low_flag & 0x8 == 0x8 {
                    PciBarPrefetchable::Prefetchable
                } else {
                    PciBarPrefetchable::NotPrefetchable
                };
                mem_bars.push(PciBarConfiguration::new(
                    i as usize,
                    size,
                    region_type,
                    prefetch,
                ));
            } else if low_flag & 0x1 == 0x1 {
                let size = !(low & 0xffff_fffc) + 1;
                self.io_regions.push(PciBarConfiguration::new(
                    i as usize,
                    size.into(),
                    PciBarRegionType::IoRegion,
                    PciBarPrefetchable::NotPrefetchable,
                ));
            }

            if is_64bit {
                i += 2;
            } else {
                i += 1;
            }
        }
        mem_bars
    }

    fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
        let offset: u32 = bar_info.reg_index() as u32 * 4;
        let mmio_region = *bar_info;
        self.mmio_regions.push(mmio_region.set_address(bar_addr));

        let val: u32 = self.config.read_config(offset);
        let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
        self.config.write_config(low, offset);
        if bar_info.is_64bit_memory() {
            let upper = (bar_addr >> 32) as u32;
            self.config.write_config(upper, offset + 4);
        }
    }

    fn allocate_root_barmem(
        &mut self,
        mem_bars: &[PciBarConfiguration],
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let address = self.pci_address.unwrap();
        let mut ranges: Vec<BarRange> = Vec::new();
        for mem_bar in mem_bars {
            let bar_size = mem_bar.size();
            let mut bar_addr: u64 = 0;
            // Don't allocate MMIO for a hotplug device; the OS will allocate
            // it from its parent's bridge window.
            if !self.hotplug {
                bar_addr = resources
                    .allocate_mmio(
                        bar_size,
                        Alloc::PciBar {
                            bus: address.bus,
                            dev: address.dev,
                            func: address.func,
                            bar: mem_bar.bar_index() as u8,
                        },
                        "vfio_bar".to_string(),
                        AllocOptions::new()
                            .prefetchable(mem_bar.is_prefetchable())
                            .max_address(if mem_bar.is_64bit_memory() {
                                u64::MAX
                            } else {
                                u32::MAX.into()
                            })
                            .align(bar_size),
                    )
                    .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
                ranges.push(BarRange {
                    addr: bar_addr,
                    size: bar_size,
                    prefetchable: mem_bar.is_prefetchable(),
                });
            }
            self.configure_barmem(mem_bar, bar_addr);
        }
        Ok(ranges)
    }

    fn allocate_nonroot_barmem(
        &mut self,
        mem_bars: &mut [PciBarConfiguration],
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        const NON_PREFETCHABLE: usize = 0;
        const PREFETCHABLE: usize = 1;
        const ARRAY_SIZE: usize = 2;
        let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
        let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
            match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
                Ok(a) => a,
                Err(e) => {
                    error!(
                        "{} init nonroot VfioResourceAllocator failed: {}",
                        self.debug_label(),
                        e
                    );
                    return Err(e);
                }
            },
            match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
                Ok(a) => a,
                Err(e) => {
                    error!(
                        "{} init nonroot VfioResourceAllocator failed: {}",
                        self.debug_label(),
                        e
                    );
                    return Err(e);
                }
            },
        ];
        let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
        // the window must be 1M-aligned as per the PCI spec
        let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
        let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];

        // Sort BARs in descending size order; this can reduce the total
        // allocated size across all the BARs.
        mem_bars.sort_by_key(|a| Reverse(a.size()));
        for mem_bar in mem_bars {
            let prefetchable = mem_bar.is_prefetchable();
            let is_64bit = mem_bar.is_64bit_memory();

            // If any prefetchable BAR is 32-bit, all the prefetchable BARs
            // must go in Low MMIO, since all the prefetchable BARs have to
            // share one region.
            if prefetchable && !is_64bit {
                memtype[PREFETCHABLE] = MmioType::Low;
            }
            let i = if prefetchable {
                PREFETCHABLE
            } else {
                NON_PREFETCHABLE
            };
            let bar_size = mem_bar.size();
            let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
                Ok(s) => s,
                Err(e) => {
                    error!(
                        "{} nonroot allocate_with_align failed: {}",
                        self.debug_label(),
                        e
                    );
                    return Err(e);
                }
            };
            window_sz[i] = max(window_sz[i], start + bar_size);
            alignment[i] = max(alignment[i], bar_size);
            let mem_info = (*mem_bar).set_address(start);
            membars[i].push(mem_info);
        }

        let address = self.pci_address.unwrap();
        let mut ranges: Vec<BarRange> = Vec::new();
        for (index, bars) in membars.iter().enumerate() {
            if bars.is_empty() {
                continue;
            }

            let i = if index == 1 {
                PREFETCHABLE
            } else {
                NON_PREFETCHABLE
            };
            let mut window_addr: u64 = 0;
            // Don't allocate MMIO for a hotplug device; the OS will allocate
            // it from its parent's bridge window.
            if !self.hotplug {
                window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
                let alloc = if i == NON_PREFETCHABLE {
                    Alloc::PciBridgeWindow {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                    }
                } else {
                    Alloc::PciBridgePrefetchWindow {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                    }
                };
                window_addr = resources
                    .mmio_allocator(memtype[i])
                    .allocate_with_align(
                        window_sz[i],
                        alloc,
                        "vfio_bar_window".to_string(),
                        alignment[i],
                    )
                    .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
                for mem_info in bars {
                    let bar_addr = window_addr + mem_info.address();
                    ranges.push(BarRange {
                        addr: bar_addr,
                        size: mem_info.size(),
                        prefetchable: mem_info.is_prefetchable(),
                    });
                }
            }

            for mem_info in bars {
                let bar_addr = window_addr + mem_info.address();
                self.configure_barmem(mem_info, bar_addr);
            }
        }
        Ok(ranges)
    }

    /// Returns the maximum IOVA (device address) supported by the VFIO PCI device
    pub fn get_max_iova(&self) -> u64 {
        self.device.get_max_addr()
    }

    fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
        self.ext_caps
            .iter()
            .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
            .cloned()
    }

    fn is_skipped_reg(&self, reg: u32) -> bool {
        // Fast path for the standard PCI config space.
        if reg < PCI_CONFIG_SPACE_SIZE {
            return false;
        }

        self.get_ext_cap_by_reg(reg)
            .map_or(false, |cap| cap.is_skipped)
    }
}

impl PciDevice for VfioPciDevice {
    fn debug_label(&self) -> String {
        format!("vfio {} device", self.device.device_name())
    }

    fn preferred_address(&self) -> Option<PciAddress> {
        Some(self.preferred_address)
    }

    fn allocate_address(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<PciAddress, PciDeviceError> {
        if self.pci_address.is_none() {
            let mut address = self.preferred_address;
            while address.func < 8 {
                if resources.reserve_pci(
                    Alloc::PciBar {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                        bar: 0,
                    },
                    self.debug_label(),
                ) {
                    self.pci_address = Some(address);
                    break;
                } else if self.hotplug_bus_number.is_none() {
                    break;
                } else {
                    address.func += 1;
                }
            }
        }
        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
    }

    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut rds = self.device.keep_rds();
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            rds.extend(interrupt_evt.as_raw_descriptors());
        }
        rds.push(self.vm_memory_client.as_raw_descriptor());
        if let Some(vm_socket_vm) = &self.vm_socket_vm {
            rds.push(vm_socket_vm.as_raw_descriptor());
        }
        if let Some(msi_cap) = &self.msi_cap {
            rds.push(msi_cap.config.get_msi_socket());
        }
        if let Some(msix_cap) = &self.msix_cap {
            rds.push(msix_cap.lock().config.as_raw_descriptor());
        }
        rds
    }

    fn preferred_irq(&self) -> PreferredIrq {
        // Is INTx configured?
        let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
            1 => PciInterruptPin::IntA,
            2 => PciInterruptPin::IntB,
            3 => PciInterruptPin::IntC,
            4 => PciInterruptPin::IntD,
            _ => return PreferredIrq::None,
        };

        // TODO: replace sysfs/irq value parsing with vfio interface
        // reporting host allocated interrupt number and type.
        let path = self.sysfs_path.join("irq");
        let gsi = fs::read_to_string(path)
            .map(|v| v.trim().parse::<u32>().unwrap_or(0))
            .unwrap_or(0);

        PreferredIrq::Fixed { pin, gsi }
    }

    fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
        // Keep event/resample event references.
        self.interrupt_evt = Some(irq_evt);

        // enable INTX
        self.enable_intx();

        self.config
            .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
        self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
    }

    fn allocate_io_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_device_bars");

        let mut mem_bars = self.collect_bars();

        let ranges = if address.bus == 0 {
            self.allocate_root_barmem(&mem_bars, resources)?
        } else {
            self.allocate_nonroot_barmem(&mut mem_bars, resources)?
        };

        // Quirk: enable IGD memory for guest VGA arbitration; otherwise the
        // kernel VGA arbiter doesn't claim this VGA device and Xorg can't
        // boot up.
        if self.is_intel_gfx() {
            let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
            cmd |= PCI_COMMAND_MEMORY;
            self.config.write_config(cmd, PCI_COMMAND);
        }
        Ok(ranges)
    }

    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let mut ranges: Vec<BarRange> = Vec::new();

        if !self.is_intel_gfx() {
            return Ok(ranges);
        }

        // Expose Intel graphics' OpRegion as an MMIO BAR: allocate a GPA for
        // it, then write that GPA into the PCI config register at 0xFC.
        if let Some((index, size)) = self.device.get_cap_type_info(
            VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
            VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
        ) {
            let address = self
                .pci_address
                .expect("allocate_address must be called prior to allocate_device_bars");
            let bar_addr = resources
                .allocate_mmio(
                    size,
                    Alloc::PciBar {
                        bus: address.bus,
                        dev: address.dev,
                        func: address.func,
                        bar: (index * 4) as u8,
                    },
                    "vfio_bar".to_string(),
                    AllocOptions::new().max_address(u32::MAX.into()),
                )
                .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
            ranges.push(BarRange {
                addr: bar_addr,
                size,
                prefetchable: false,
            });
            self.device_data = Some(DeviceData::IntelGfxData {
                opregion_index: index,
            });

            self.mmio_regions.push(
                PciBarConfiguration::new(
                    index as usize,
                    size,
                    PciBarRegionType::Memory32BitRegion,
                    PciBarPrefetchable::NotPrefetchable,
                )
                .set_address(bar_addr),
            );
            self.config.write_config(bar_addr as u32, 0xFC);
        }

        Ok(ranges)
    }

    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
            if region.bar_index() == bar_num {
                let command: u8 = self.config.read_config(PCI_COMMAND);
                if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
                    return None;
                } else {
                    return Some(*region);
                }
            }
        }

        None
    }

    fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
        Ok(())
    }

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        let reg: u32 = (reg_idx * 4) as u32;
        let mut config: u32 = self.config.read_config(reg);

        // See VfioPciDevice::new for details on how extended caps are managed.
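        // An extended capability header is a single 32-bit word: bits 15:0
        // hold the capability ID, bits 19:16 the version, and bits 31:20 the
        // offset of the next capability, so the masking below rewrites only
        // the next pointer (dword-aligned, hence the 0xffc mask). Reads of a
        // skipped capability return zeros, except the first one at
        // PCI_CONFIG_SPACE_SIZE, whose ID is replaced with CAC so the chain
        // stays walkable.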
        if reg >= PCI_CONFIG_SPACE_SIZE {
            let ext_cap = self.get_ext_cap_by_reg(reg);
            if let Some(ext_cap) = ext_cap {
                if ext_cap.offset == reg {
                    config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
                }

                if ext_cap.is_skipped {
                    if reg == PCI_CONFIG_SPACE_SIZE {
                        config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
                    } else {
                        config = 0;
                    }
                }
            }
        }

        // Ignore I/O BARs.
        if (0x10..=0x24).contains(&reg) {
            let bar_idx = (reg as usize - 0x10) / 4;
            if let Some(bar) = self.get_bar_configuration(bar_idx) {
                if bar.is_io() {
                    config = 0;
                }
            }
        } else if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(reg, 4) {
                msix_cap.read_msix_control(&mut config);
            }
        } else if let Some(pm_cap) = &self.pm_cap {
            let pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(reg) {
                config = pm_cap.read(reg);
            }
        }

        // Quirk for Intel graphics: report a stolen memory size of 0 in
        // pci_cfg[0x51].
        if self.is_intel_gfx() && reg == 0x50 {
            config &= 0xffff00ff;
        }

        config
    }

    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        // Start the worker thread on the guest's first config register write.
        if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
            self.start_work_thread();
        };

        let start = (reg_idx * 4) as u64 + offset;

        if let Some(pm_cap) = self.pm_cap.as_mut() {
            let mut pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(start as u32) {
                pm_cap.write(start, data);
            }
        }

        let mut msi_change: Option<VfioMsiChange> = None;
        if let Some(msi_cap) = self.msi_cap.as_mut() {
            if msi_cap.is_msi_reg(start, data.len()) {
                msi_change = msi_cap.write_msi_reg(start, data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msi(),
            Some(VfioMsiChange::Disable) => self.disable_msi(),
            _ => (),
        }

        msi_change = None;
        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
                msi_change = msix_cap.write_msix_control(data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msix(),
            Some(VfioMsiChange::Disable) => self.disable_msix(),
            Some(VfioMsiChange::FunctionChanged) => {
                if let Err(e) = self.msix_vectors_update() {
                    error!("update msix vectors failed: {}", e);
                }
            }
            _ => (),
        }

        if !self.is_skipped_reg(start as u32) {
            self.device
                .region_write(VFIO_PCI_CONFIG_REGION_INDEX as usize, data, start);
        }

        // Once the guest enables memory access, map the BARs.
        if start == PCI_COMMAND as u64
            && data.len() == 2
            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
        {
            self.commit_bars_mmap();
        } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
            // A BAR register was rewritten. A 64-bit BAR spans two consecutive
            // 32-bit registers: a write to the region's own index updates the
            // low half, and a write to the following (odd) index updates the
            // high half.
            let bar_idx = (start as u32 - 0x10) / 4;
            let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
            let val = u32::from_le_bytes(value);
            let mut modify = false;
            for region in self.mmio_regions.iter_mut() {
                if region.bar_index() == bar_idx as usize {
                    let old_addr = region.address();
                    let new_addr = val & 0xFFFFFFF0;
                    if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change the 32-bit BAR address.
                        *region = region.set_address(u64::from(new_addr));
                        modify = true;
                    } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change the low half of the 64-bit BAR address.
                        *region =
                            region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
                        modify = true;
                    }
                    break;
                } else if region.is_64bit_memory()
                    && ((bar_idx % 2) == 1)
                    && (region.bar_index() + 1 == bar_idx as usize)
                {
                    // Change the high half of the 64-bit BAR address.
                    let old_addr = region.address();
                    if val != (old_addr >> 32) as u32 {
                        let mut new_addr = (u64::from(val)) << 32;
                        new_addr |= old_addr & 0xFFFFFFFF;
                        *region = region.set_address(new_addr);
                        modify = true;
                    }
                    break;
                }
            }
            if modify {
                // If a BAR changes while memory access is enabled, mmap the
                // new BAR immediately.
                let cmd = self.config.read_config::<u8>(PCI_COMMAND);
                if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
                    self.commit_bars_mmap();
                }
            }
        }
    }

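    // Virtual config (vcfg) registers are crosvm-specific registers backing
    // guest ACPI methods. PCI_VCFG_NOTY pops the oldest queued ACPI
    // notification value, returning 0 when the queue is empty.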
    fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
        if reg_idx == PCI_VCFG_NOTY {
            let mut q = self.acpi_notifier_val.lock();
            let mut val = 0;
            if !q.is_empty() {
                val = q.remove(0);
            }
            drop(q);
            return val;
        }

        warn!(
            "{} read unsupported vcfg register {}",
            self.debug_label(),
            reg_idx
        );
        0xFFFF_FFFF
    }

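    // PCI_VCFG_PM moves the device in and out of a low-power state, arming a
    // wakeup event when one has been registered. PCI_VCFG_DSM evaluates an
    // ACPI _DSM on the host: the guest stages the arguments in the vcfg
    // shared-memory page, the write triggers the call, and the result is
    // written back to the same page.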
    fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
        match reg_idx {
            PCI_VCFG_PM => {
                match value {
                    0 => {
                        if let Some(pm_evt) =
                            self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
                        {
                            *self.is_in_low_power.lock() = true;
                            let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
                        } else {
                            let _ = self.device.pm_low_power_enter();
                        }
                    }
                    _ => {
                        *self.is_in_low_power.lock() = false;
                        let _ = self.device.pm_low_power_exit();
                    }
                };
            }
            PCI_VCFG_DSM => {
                if let Some(shm) = &self.vcfg_shm_mmap {
                    let mut args = [0u8; 4096];
                    if let Err(e) = shm.read_slice(&mut args, 0) {
                        error!("failed to read DSM Args: {}", e);
                        return;
                    }
                    let res = match self.device.acpi_dsm(&args) {
                        Ok(r) => r,
                        Err(e) => {
                            error!("failed to call DSM: {}", e);
                            return;
                        }
                    };
                    if let Err(e) = shm.write_slice(&res, 0) {
                        error!("failed to write DSM result: {}", e);
                        return;
                    }
                    if let Err(e) = shm.msync() {
                        error!("failed to msync: {}", e)
                    }
                }
            }
            _ => warn!(
                "{} write unsupported vcfg register {}",
                self.debug_label(),
                reg_idx
            ),
        };
    }

    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
        if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_table(bar_index, offset) {
                msix_cap.read_table(offset, data);
                return;
            } else if msix_cap.is_msix_pba(bar_index, offset) {
                msix_cap.read_pba(offset, data);
                return;
            }
        }
        self.device.region_read(bar_index, data, offset);
    }

    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
        // Ignore writes to the IGD OpRegion.
        if let Some(device_data) = &self.device_data {
            match *device_data {
                DeviceData::IntelGfxData { opregion_index } => {
                    if opregion_index == bar_index as u32 {
                        return;
                    }
                }
            }
        }

        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_table(bar_index, offset) {
                let behavior = msix_cap.write_table(offset, data);
                if let MsixStatus::EntryChanged(index) = behavior {
                    let irqfd = msix_cap.get_msix_irqfd(index);
                    self.msix_vector_update(index, irqfd);
                }
                return;
            } else if msix_cap.is_msix_pba(bar_index, offset) {
                msix_cap.write_pba(offset, data);
                return;
            }
        }

        self.device.region_write(bar_index, data, offset);
    }

    fn destroy_device(&mut self) {
        self.close();
    }

    fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
        let mut amls = Vec::new();
        let mut shm = None;
        if let Some(pci_address) = self.pci_address {
            let vcfg_offset = pci_address.to_config_address(0, 13);
            if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
                vcfg_register.to_aml_bytes(&mut amls);
                shm = vcfg_register
                    .create_shm_mmap()
                    .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
                self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
                // All vfio-pci devices should have a virtual _PRx method;
                // otherwise the host cannot tell whether the device has
                // entered a suspend state, always considers it active, and
                // prevents its parent PCIe switch from suspending.
                PowerResourceMethod {}.to_aml_bytes(&mut amls);
                // TODO: Ideally, we should generate a _DSM only if the
                // physical device has one; however, Linux does not expose
                // that information. As a temporary workaround, we check
                // whether there is an associated ACPI companion device node
                // and skip generating a guest _DSM if there is none.
                let acpi_path = self.sysfs_path.join("firmware_node/path");
                if acpi_path.exists() {
                    DsmMethod {}.to_aml_bytes(&mut amls);
                }
            }
        }

        (amls, shm)
    }

    fn set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32> {
        if let Some(gpe_nr) = resources.allocate_gpe() {
            base::debug!("set_gpe: gpe-nr {} addr {:?}", gpe_nr, self.pci_address);
            self.gpe = Some(gpe_nr);
        }
        self.gpe
    }
}

impl Suspendable for VfioPciDevice {
    fn sleep(&mut self) -> anyhow::Result<()> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let res = worker_thread.stop();
            self.pci_address = Some(res.address);
            self.sysfs_path = res.sysfs_path;
            self.pm_cap = res.pm_cap;
            self.msix_cap = res.msix_cap;
            self.vm_socket_vm = Some(res.vm_socket);
        }
        Ok(())
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            self.start_work_thread();
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use resources::AddressRange;

    use super::VfioResourceAllocator;

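    // `allocate_at_can_overlap` carves the requested range out of the free
    // regions and silently ignores any portion of the request that falls
    // outside them; each test below asserts the free regions that remain.
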
    #[test]
    fn no_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
            .unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
    }

    #[test]
    fn complete_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_one() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_two() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
    }

    #[test]
    fn partial_overlap_three() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 39], [48, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
            .unwrap();
        // regions [32, 39], [48, 63], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
            .unwrap();
        // regions [32, 35], [76, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
    }
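
    // Additional sketch (not part of the original suite): if the requested
    // range is a superset of all free space, every region should be consumed,
    // assuming the same carve-out semantics exercised above.
    #[test]
    fn overlap_superset() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions: none
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 127))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), None);
    }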
}